# Import des librairies
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
import seaborn as sns
import dataprep.eda as eda
import pingouin as pg
import missingno as msno
bleu=(70/255, 130/255, 180/255)
pd.set_option('display.max_row', 60)
pd.set_option('display.max_column', 60)
C:\Users\demo\anaconda3\envs\Python 3 Tabnine\lib\site-packages\outdated\utils.py:14: OutdatedPackageWarning: The package outdated is out of date. Your version is 0.2.0, the latest is 0.2.1. Set the environment variable OUTDATED_IGNORE=1 to disable these warnings. return warn( C:\Users\demo\anaconda3\envs\Python 3 Tabnine\lib\site-packages\outdated\utils.py:14: OutdatedPackageWarning: The package pingouin is out of date. Your version is 0.3.10, the latest is 0.3.11. Set the environment variable OUTDATED_IGNORE=1 to disable these warnings. return warn(
Relevés de 2015
Buildings2015=pd.read_csv('2015-building-energy-benchmarking.csv')
meta_2015 = pd.read_json('socrata_metadata_2015-building-energy-benchmarking.json',
orient='index')
# Affichage du chargement
display(Buildings2015.shape,Buildings2015.head(2))
meta_2015
(3340, 47)
| OSEBuildingID | DataYear | BuildingType | PrimaryPropertyType | PropertyName | TaxParcelIdentificationNumber | Location | CouncilDistrictCode | Neighborhood | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | YearsENERGYSTARCertified | ENERGYSTARScore | SiteEUI(kBtu/sf) | SiteEUIWN(kBtu/sf) | SourceEUI(kBtu/sf) | SourceEUIWN(kBtu/sf) | SiteEnergyUse(kBtu) | SiteEnergyUseWN(kBtu) | SteamUse(kBtu) | Electricity(kWh) | Electricity(kBtu) | NaturalGas(therms) | NaturalGas(kBtu) | OtherFuelUse(kBtu) | GHGEmissions(MetricTonsCO2e) | GHGEmissionsIntensity(kgCO2e/ft2) | DefaultData | Comment | ComplianceStatus | Outlier | 2010 Census Tracts | Seattle Police Department Micro Community Policing Plan Areas | City Council Districts | SPD Beats | Zip Codes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2015 | NonResidential | Hotel | MAYFLOWER PARK HOTEL | 659000030 | {'latitude': '47.61219025', 'longitude': '-122... | 7 | DOWNTOWN | 1927 | 1 | 12.0 | 88434 | 0 | 88434 | Hotel | Hotel | 88434.0 | NaN | NaN | NaN | NaN | NaN | 65.0 | 78.9 | 80.3 | 173.5 | 175.1 | 6981428.0 | 7097539.0 | 2023032.0 | 1080307.0 | 3686160.0 | 12724.0 | 1272388.0 | 0.0 | 249.43 | 2.64 | No | NaN | Compliant | NaN | NaN | 14.0 | NaN | 31.0 | 18081 |
| 1 | 2 | 2015 | NonResidential | Hotel | PARAMOUNT HOTEL | 659000220 | {'latitude': '47.61310583', 'longitude': '-122... | 7 | DOWNTOWN | 1996 | 1 | 11.0 | 103566 | 15064 | 88502 | Hotel, Parking, Restaurant | Hotel | 83880.0 | Parking | 15064.0 | Restaurant | 4622.0 | NaN | 51.0 | 94.4 | 99.0 | 191.3 | 195.2 | 8354235.0 | 8765788.0 | 0.0 | 1144563.0 | 3905411.0 | 44490.0 | 4448985.0 | 0.0 | 263.51 | 2.38 | No | NaN | Compliant | NaN | NaN | 14.0 | NaN | 31.0 | 18081 |
| 0 | |
|---|---|
| id | h7rm-fz6m |
| name | 2015 Building Energy Benchmarking |
| assetType | dataset |
| attribution | City of Seattle |
| attributionLink | http://www.seattle.gov/environment |
| averageRating | 0 |
| createdAt | 1492461362 |
| description | Seattle’s Building Energy Benchmarking and Rep... |
| displayType | table |
| downloadCount | 2546 |
| hideFromCatalog | False |
| hideFromDataJson | False |
| indexUpdatedAt | 1551379229 |
| licenseId | PUBLIC_DOMAIN |
| newBackend | True |
| numberOfComments | 0 |
| oid | 28249767 |
| provenance | official |
| publicationAppendEnabled | False |
| publicationDate | 1521489624 |
| publicationGroup | 14152737 |
| publicationStage | published |
| rowsUpdatedAt | 1521489255 |
| rowsUpdatedBy | 9yyf-ecsx |
| tableId | 14945997 |
| totalTimesRated | 0 |
| viewCount | 4217 |
| viewLastModified | 1557955129 |
| viewType | tabular |
| approvals | [{'reviewedAt': 1521489624, 'reviewedAutomatic... |
| columns | [{'id': 350402298, 'name': 'OSEBuildingID', 'd... |
| grants | [{'inherited': False, 'type': 'viewer', 'flags... |
| license | {'name': 'Public Domain'} |
| metadata | {'rdfSubject': '0', 'jsonQuery': {'order': [{'... |
| owner | {'id': '9yyf-ecsx', 'displayName': 'Sullivan, ... |
| query | {'orderBys': [{'ascending': True, 'expression'... |
| rights | [read] |
| tableAuthor | {'id': '9yyf-ecsx', 'displayName': 'Sullivan, ... |
| tags | [building, eui, benchmarking, energy, electric... |
| flags | [default, restorable, restorePossibleForType] |
# Création d'un dataframe contenant les métadonnées
meta_data_2015=pd.DataFrame(meta_2015.T['columns'][0])
meta_data_2015
| id | name | dataTypeName | description | fieldName | position | renderTypeName | tableColumnId | width | cachedContents | format | computationStrategy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 350402298 | OSEBuildingID | number | A unique identifier assigned to each property ... | seattlebuildingid | 1 | number | 46302822 | 182.0 | {'largest': '50059', 'non_null': 3340, 'averag... | {'precisionStyle': 'standard', 'noCommas': 'tr... | NaN |
| 1 | 350402299 | DataYear | text | NaN | datayear | 2 | text | 46302823 | 196.0 | {'largest': '2015', 'non_null': 3340, 'null': ... | {} | NaN |
| 2 | 350402300 | BuildingType | text | City of Seattle building type classification. | buildingtype | 3 | text | 46302824 | 244.0 | {'largest': 'SPS-District K-12', 'non_null': 3... | {'displayStyle': 'plain', 'align': 'left'} | NaN |
| 3 | 350402301 | PrimaryPropertyType | text | The primary use of a property (e.g. office, re... | primarypropertytype | 4 | text | 46302825 | 328.0 | {'largest': 'Worship Facility', 'non_null': 33... | {'displayStyle': 'plain', 'align': 'left'} | NaN |
| 4 | 350402302 | PropertyName | text | Official or common property name. | propertyname | 5 | text | 46302826 | 244.0 | {'largest': 'ZYMOGENETICS-DENDREON', 'non_null... | {'displayStyle': 'plain', 'align': 'left'} | NaN |
| 5 | 350402873 | TaxParcelIdentificationNumber | text | \t\nProperty King County PIN | taxparcelidentificationnumber | 6 | text | 46302828 | 448.0 | {'largest': '9835200050', 'non_null': 3338, 'n... | {'precisionStyle': 'standard', 'noCommas': 'tr... | NaN |
| 6 | 350402305 | Location | location | NaN | location | 7 | location | 46302864 | 384.0 | NaN | {'view': 'address', 'align': 'left'} | NaN |
| 7 | 350402306 | CouncilDistrictCode | number | Property City of Seattle council district. | councildistrictcode | 12 | number | 46302829 | 328.0 | {'largest': '7', 'non_null': 3340, 'average': ... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 8 | 350402307 | Neighborhood | text | NaN | neighborhood | 13 | text | 46302830 | 244.0 | {'largest': 'SOUTHWEST', 'non_null': 3340, 'nu... | {} | NaN |
| 9 | 350402308 | YearBuilt | number | Year in which a property was constructed or un... | yearbuilt | 14 | number | 46302831 | 208.0 | {'largest': '2015', 'non_null': 3340, 'average... | {'precisionStyle': 'standard', 'noCommas': 'tr... | NaN |
| 10 | 350402309 | NumberofBuildings | number | NaN | numberofbuildings | 15 | number | 46302832 | 304.0 | {'largest': '39', 'non_null': 3340, 'average':... | {} | NaN |
| 11 | 350402310 | NumberofFloors | number | NaN | numberoffloors | 16 | number | 46302833 | 268.0 | {'largest': '99', 'non_null': 3332, 'average':... | {} | NaN |
| 12 | 350402311 | PropertyGFATotal | number | Total building and parking gross floor area. | propertygfatotal | 17 | number | 46302834 | 292.0 | {'largest': '2200000', 'non_null': 3340, 'aver... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 13 | 350402312 | PropertyGFAParking | number | Total space in square feet of all types of par... | propertygfaparking | 18 | number | 46302835 | 316.0 | {'largest': '512608', 'non_null': 3340, 'avera... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 14 | 350402313 | PropertyGFABuilding(s) | number | Total floor space in square feet between the o... | propertygfabuilding_s | 19 | number | 46302836 | 364.0 | {'largest': '2200000', 'non_null': 3340, 'aver... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 15 | 350402314 | ListOfAllPropertyUseTypes | text | NaN | listofallpropertyusetypes | 20 | text | 46302837 | 400.0 | {'largest': 'Worship Facility', 'non_null': 32... | {} | NaN |
| 16 | 350402315 | LargestPropertyUseType | text | NaN | largestpropertyusetype | 21 | text | 46302838 | 364.0 | {'largest': 'Worship Facility', 'non_null': 32... | {} | NaN |
| 17 | 350402316 | LargestPropertyUseTypeGFA | number | NaN | largestpropertyusetypegfa | 22 | number | 46302839 | 400.0 | {'largest': '1719643', 'non_null': 3204, 'aver... | {} | NaN |
| 18 | 350402317 | SecondLargestPropertyUseType | text | NaN | secondlargestpropertyusetype | 23 | text | 46302840 | 436.0 | {'largest': 'Worship Facility', 'non_null': 15... | {} | NaN |
| 19 | 350402318 | SecondLargestPropertyUseTypeGFA | number | NaN | secondlargestpropertyusetypegfa | 24 | number | 46302841 | 472.0 | {'largest': '686750', 'non_null': 1559, 'avera... | {} | NaN |
| 20 | 350402319 | ThirdLargestPropertyUseType | text | NaN | thirdlargestpropertyusetype | 25 | text | 46302842 | 424.0 | {'largest': 'Worship Facility', 'non_null': 56... | {} | NaN |
| 21 | 350402320 | ThirdLargestPropertyUseTypeGFA | number | NaN | thirdlargestpropertyusetypegfa | 26 | number | 46302843 | 460.0 | {'largest': '303910', 'non_null': 560, 'averag... | {} | NaN |
| 22 | 350402321 | YearsENERGYSTARCertified | text | NaN | yearsenergystarcertified | 27 | text | 46302844 | 388.0 | {'largest': '2017, 2016, 2014, 2013, 2012, 201... | {} | NaN |
| 23 | 350402322 | ENERGYSTARScore | number | An EPA calculated 1-100 rating that assesses a... | energystarscore | 28 | number | 46302845 | 280.0 | {'largest': '100.00', 'non_null': 2560, 'avera... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 24 | 350402323 | SiteEUI(kBtu/sf) | number | Site Energy Use Intensity (EUI) is a property'... | siteeui_kbtu_sf | 29 | number | 46302846 | 292.0 | {'largest': '800.60', 'non_null': 3330, 'avera... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 25 | 350402324 | SiteEUIWN(kBtu/sf) | number | \t\nWeather Normalized (WN) Site Energy Use In... | siteeuiwn_kbtu_sf | 30 | number | 46302847 | 316.0 | {'largest': '800.60', 'non_null': 3330, 'avera... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 26 | 350402325 | SourceEUI(kBtu/sf) | number | Source Energy Use Intensity (EUI) is a propert... | sourceeui_kbtu_sf | 31 | number | 46302848 | 316.0 | {'largest': '2511.10', 'non_null': 3330, 'aver... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 27 | 350402326 | SourceEUIWN(kBtu/sf) | number | Weather Normalized (WN) Source Energy Use Inte... | sourceeuiwn_kbtu_sf | 32 | number | 46302849 | 340.0 | {'largest': '2511.00', 'non_null': 3330, 'aver... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 28 | 350402327 | SiteEnergyUse(kBtu) | number | \t\nThe annual amount of energy consumed by th... | siteenergyuse_kbtu | 33 | number | 46302850 | 328.0 | {'largest': '295812640', 'non_null': 3330, 'av... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 29 | 350402328 | SiteEnergyUseWN(kBtu) | number | NaN | siteenergyusewn_kbtu | 34 | number | 46302851 | 352.0 | {'largest': '297740960', 'non_null': 3330, 'av... | {} | NaN |
| 30 | 350402329 | SteamUse(kBtu) | number | The annual amount of district steam consumed b... | steamuse_kbtu | 35 | number | 46302852 | 268.0 | {'largest': '127869744', 'non_null': 3330, 'av... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 31 | 350402330 | Electricity(kWh) | number | \t\nThe annual amount of electricity consumed ... | electricity_kwh | 36 | number | 46302853 | 292.0 | {'largest': '83445048', 'non_null': 3330, 'ave... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 32 | 350402331 | Electricity(kBtu) | number | \t\nThe annual amount of electricity consumed ... | electricity_kbtu | 37 | number | 46302854 | 304.0 | {'largest': '284726322', 'non_null': 3330, 'av... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 33 | 350402332 | NaturalGas(therms) | number | The annual amount of utility-supplied natural ... | naturalgas_therms | 38 | number | 46302855 | 316.0 | {'largest': '1364484', 'non_null': 3330, 'aver... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 34 | 350402333 | NaturalGas(kBtu) | number | NaN | naturalgas_kbtu | 39 | number | 46302856 | 292.0 | {'largest': '136448438', 'non_null': 3330, 'av... | {} | NaN |
| 35 | 350402334 | OtherFuelUse(kBtu) | number | NaN | otherfueluse_kbtu | 40 | number | 46302857 | 316.0 | {'largest': '8269669', 'non_null': 3330, 'aver... | {} | NaN |
| 36 | 350402335 | GHGEmissions(MetricTonsCO2e) | number | The total amount of greenhouse gas emissions, ... | ghgemissions_metrictonsco2e | 41 | number | 46302858 | 436.0 | {'largest': '11824.89', 'non_null': 3330, 'ave... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 37 | 350402336 | GHGEmissionsIntensity(kgCO2e/ft2) | number | \t\nTotal Greenhouse Gas Emissions divided by ... | ghgemissionsintensity_kgco2e_ft2 | 42 | number | 46302859 | 496.0 | {'largest': '31.38', 'non_null': 3330, 'averag... | {'precisionStyle': 'standard', 'noCommas': 'fa... | NaN |
| 38 | 350402337 | DefaultData | text | NaN | defaultdata | 43 | text | 46302860 | 232.0 | {'largest': 'Yes', 'non_null': 3339, 'null': '... | {} | NaN |
| 39 | 350402338 | Comment | text | NaN | comment | 44 | text | 46302861 | 184.0 | {'largest': 'Under construction starting 6/201... | {} | NaN |
| 40 | 350402339 | ComplianceStatus | text | NaN | compliancestatus | 45 | text | 46302862 | 292.0 | {'largest': 'Not Compliant', 'non_null': 3340,... | {} | NaN |
| 41 | 350402340 | Outlier | text | NaN | outlier | 46 | text | 46302863 | 184.0 | {'largest': 'Low Outlier', 'non_null': 84, 'nu... | {} | NaN |
| 42 | 350404102 | 2010 Census Tracts | number | NaN | :@computed_region_2day_rhn5 | 47 | number | 62603791 | NaN | NaN | {} | {'source_columns': ['location'], 'type': 'geor... |
| 43 | 350404103 | Seattle Police Department Micro Community Poli... | number | NaN | :@computed_region_ru88_fbhk | 48 | number | 62603792 | NaN | NaN | {} | {'source_columns': ['location'], 'type': 'geor... |
| 44 | 350404104 | City Council Districts | number | NaN | :@computed_region_cyqu_gs94 | 49 | number | 62603793 | NaN | NaN | {} | {'source_columns': ['location'], 'type': 'geor... |
| 45 | 350404105 | SPD Beats | number | NaN | :@computed_region_kuhn_3gp2 | 50 | number | 62603794 | NaN | NaN | {} | {'source_columns': ['location'], 'type': 'geor... |
| 46 | 350404106 | Zip Codes | number | NaN | :@computed_region_q256_3sug | 51 | number | 62603795 | NaN | NaN | {} | {'source_columns': ['location'], 'type': 'geor... |
# Noms des variables année 2015
with pd.option_context('display.max_colwidth', None):
display(meta_data_2015[['name','description']])
| name | description | |
|---|---|---|
| 0 | OSEBuildingID | A unique identifier assigned to each property covered by the Seattle Benchmarking Ordinance for tracking and identification purposes. |
| 1 | DataYear | NaN |
| 2 | BuildingType | City of Seattle building type classification. |
| 3 | PrimaryPropertyType | The primary use of a property (e.g. office, retail store). Primary use is defined as a function that accounts for more than 50% of a property. This is the Property Type - EPA Calculated field from Portfolio Manager. |
| 4 | PropertyName | Official or common property name. |
| 5 | TaxParcelIdentificationNumber | \t\nProperty King County PIN |
| 6 | Location | NaN |
| 7 | CouncilDistrictCode | Property City of Seattle council district. |
| 8 | Neighborhood | NaN |
| 9 | YearBuilt | Year in which a property was constructed or underwent a complete renovation. |
| 10 | NumberofBuildings | NaN |
| 11 | NumberofFloors | NaN |
| 12 | PropertyGFATotal | Total building and parking gross floor area. |
| 13 | PropertyGFAParking | Total space in square feet of all types of parking (Fully Enclosed, Partially Enclosed, and Open). |
| 14 | PropertyGFABuilding(s) | Total floor space in square feet between the outside surfaces of a building’s enclosing walls. This includes all areas inside the building(s), such as tenant space, common areas, stairwells, basements, storage, etc. |
| 15 | ListOfAllPropertyUseTypes | NaN |
| 16 | LargestPropertyUseType | NaN |
| 17 | LargestPropertyUseTypeGFA | NaN |
| 18 | SecondLargestPropertyUseType | NaN |
| 19 | SecondLargestPropertyUseTypeGFA | NaN |
| 20 | ThirdLargestPropertyUseType | NaN |
| 21 | ThirdLargestPropertyUseTypeGFA | NaN |
| 22 | YearsENERGYSTARCertified | NaN |
| 23 | ENERGYSTARScore | An EPA calculated 1-100 rating that assesses a property’s overall energy performance, based on national data to control for differences among climate, building uses, and operations. A score of 50 represents the national median. |
| 24 | SiteEUI(kBtu/sf) | Site Energy Use Intensity (EUI) is a property's Site Energy Use divided by its gross floor area. Site Energy Use is the annual amount of all the energy consumed by the property on-site, as reported on utility bills. Site EUI is measured in thousands of British thermal units (kBtu) per square foot. |
| 25 | SiteEUIWN(kBtu/sf) | \t\nWeather Normalized (WN) Site Energy Use Intensity (EUI) is a property's WN Site Energy divided by its gross floor area (in square feet). WN Site Energy is the Site Energy Use the property would have consumed during 30-year average weather conditions. WN Site EUI is measured in measured in thousands of British thermal units (kBtu) per square foot. |
| 26 | SourceEUI(kBtu/sf) | Source Energy Use Intensity (EUI) is a property's Source Energy Use divided by its gross floor area. Source Energy Use is the annual energy used to operate the property, including losses from generation, transmission, & distribution. Source EUI is measured in thousands of British thermal units (kBtu) per square foot. |
| 27 | SourceEUIWN(kBtu/sf) | Weather Normalized (WN) Source Energy Use Intensity (EUI) is a property's WN Source Energy divided by its gross floor area. WN Source Energy is the Source Energy Use the property would have consumed during 30-year average weather conditions. WN Source EUI is measured in measured in thousands of British thermal units (kBtu) per square foot. |
| 28 | SiteEnergyUse(kBtu) | \t\nThe annual amount of energy consumed by the property from all sources of energy. |
| 29 | SiteEnergyUseWN(kBtu) | NaN |
| 30 | SteamUse(kBtu) | The annual amount of district steam consumed by the property on-site, measured in thousands of British thermal units (kBtu). |
| 31 | Electricity(kWh) | \t\nThe annual amount of electricity consumed by the property on-site, including electricity purchased from the grid and generated by onsite renewable systems, measured in kWh. |
| 32 | Electricity(kBtu) | \t\nThe annual amount of electricity consumed by the property on-site, including electricity purchased from the grid and generated by onsite renewable systems, measured in thousands of British thermal units (kBtu). |
| 33 | NaturalGas(therms) | The annual amount of utility-supplied natural gas consumed by the property, measured in therms. |
| 34 | NaturalGas(kBtu) | NaN |
| 35 | OtherFuelUse(kBtu) | NaN |
| 36 | GHGEmissions(MetricTonsCO2e) | The total amount of greenhouse gas emissions, including carbon dioxide, methane, and nitrous oxide gases released into the atmosphere as a result of energy consumption at the property, measured in metric tons of carbon dioxide equivalent. This calculation uses a GHG emissions factor from Seattle CIty Light's portfolio of generating resources. This uses Seattle City Light's 2015 emissions factor of 52.44 lbs CO2e/MWh. Enwave steam factor = 170.17 lbs CO2e/MMBtu. Gas factor sourced from EPA Portfolio Manager = 53.11 kg CO2e/MBtu. |
| 37 | GHGEmissionsIntensity(kgCO2e/ft2) | \t\nTotal Greenhouse Gas Emissions divided by property's gross floor area, measured in kilograms of carbon dioxide equivalent per square foot. This calculation uses a GHG emissions factor from Seattle City Light's portfolio of generating resources |
| 38 | DefaultData | NaN |
| 39 | Comment | NaN |
| 40 | ComplianceStatus | NaN |
| 41 | Outlier | NaN |
| 42 | 2010 Census Tracts | NaN |
| 43 | Seattle Police Department Micro Community Policing Plan Areas | NaN |
| 44 | City Council Districts | NaN |
| 45 | SPD Beats | NaN |
| 46 | Zip Codes | NaN |
# Suppression des doublons
Buildings2015.drop_duplicates(inplace=True)
Buildings2015.shape
(3340, 47)
Relevés de 2016
Buildings2016=pd.read_csv('2016-building-energy-benchmarking.csv')
meta_2016 = pd.read_json('socrata_metadata_2016-building-energy-benchmarking.json',
orient='index')
# Affichage du chargement
display(Buildings2016.shape,Buildings2016.head(2))
meta_2016
(3376, 46)
| OSEBuildingID | DataYear | BuildingType | PrimaryPropertyType | PropertyName | Address | City | State | ZipCode | TaxParcelIdentificationNumber | CouncilDistrictCode | Neighborhood | Latitude | Longitude | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | YearsENERGYSTARCertified | ENERGYSTARScore | SiteEUI(kBtu/sf) | SiteEUIWN(kBtu/sf) | SourceEUI(kBtu/sf) | SourceEUIWN(kBtu/sf) | SiteEnergyUse(kBtu) | SiteEnergyUseWN(kBtu) | SteamUse(kBtu) | Electricity(kWh) | Electricity(kBtu) | NaturalGas(therms) | NaturalGas(kBtu) | DefaultData | Comments | ComplianceStatus | Outlier | TotalGHGEmissions | GHGEmissionsIntensity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2016 | NonResidential | Hotel | Mayflower park hotel | 405 Olive way | Seattle | WA | 98101.0 | 0659000030 | 7 | DOWNTOWN | 47.61220 | -122.33799 | 1927 | 1.0 | 12 | 88434 | 0 | 88434 | Hotel | Hotel | 88434.0 | NaN | NaN | NaN | NaN | NaN | 60.0 | 81.699997 | 84.300003 | 182.500000 | 189.000000 | 7226362.5 | 7456910.0 | 2003882.0 | 1.156514e+06 | 3946027.0 | 12764.52930 | 1276453.0 | False | NaN | Compliant | NaN | 249.98 | 2.83 |
| 1 | 2 | 2016 | NonResidential | Hotel | Paramount Hotel | 724 Pine street | Seattle | WA | 98101.0 | 0659000220 | 7 | DOWNTOWN | 47.61317 | -122.33393 | 1996 | 1.0 | 11 | 103566 | 15064 | 88502 | Hotel, Parking, Restaurant | Hotel | 83880.0 | Parking | 15064.0 | Restaurant | 4622.0 | NaN | 61.0 | 94.800003 | 97.900002 | 176.100006 | 179.399994 | 8387933.0 | 8664479.0 | 0.0 | 9.504252e+05 | 3242851.0 | 51450.81641 | 5145082.0 | False | NaN | Compliant | NaN | 295.86 | 2.86 |
| 0 | |
|---|---|
| id | 2bpz-gwpy |
| name | 2016 Building Energy Benchmarking |
| assetType | dataset |
| attribution | City of Seattle |
| attributionLink | https://www.seattle.gov/energybenchmarking |
| averageRating | 0 |
| createdAt | 1521070051 |
| description | Seattle’s Building Energy Benchmarking and Rep... |
| displayType | table |
| downloadCount | 2024 |
| hideFromCatalog | False |
| hideFromDataJson | False |
| indexUpdatedAt | 1521490072 |
| licenseId | PUBLIC_DOMAIN |
| newBackend | True |
| numberOfComments | 0 |
| oid | 28218043 |
| provenance | official |
| publicationAppendEnabled | False |
| publicationDate | 1521071079 |
| publicationGroup | 14936543 |
| publicationStage | published |
| rowIdentifierColumnId | 349768636 |
| rowsUpdatedAt | 1521071078 |
| rowsUpdatedBy | 9yyf-ecsx |
| tableId | 14936543 |
| totalTimesRated | 0 |
| viewCount | 2234 |
| viewLastModified | 1557954938 |
| viewType | tabular |
| approvals | [{'reviewedAt': 1521071079, 'reviewedAutomatic... |
| columns | [{'id': 349768636, 'name': 'OSEBuildingID', 'd... |
| grants | [{'inherited': False, 'type': 'viewer', 'flags... |
| license | {'name': 'Public Domain'} |
| metadata | {'sidebar': {'width': 482}, 'custom_fields': {... |
| owner | {'id': '9yyf-ecsx', 'displayName': 'Sullivan, ... |
| query | {} |
| rights | [read] |
| tableAuthor | {'id': '9yyf-ecsx', 'displayName': 'Sullivan, ... |
| tags | [building, eui, energy, steam, electricity, gr... |
| flags | [default, restorable, restorePossibleForType] |
# Création d'un dataframe contenant les métadonnées
meta_data_2016=pd.DataFrame(meta_2016.T['columns'][0])
meta_data_2016
| id | name | dataTypeName | description | fieldName | position | renderTypeName | tableColumnId | cachedContents | format | width | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 349768636 | OSEBuildingID | number | A unique identifier assigned to each property ... | osebuildingid | 1 | number | 62418341 | {'largest': '50226', 'non_null': 3376, 'averag... | {'groupSeparator': ''} | NaN |
| 1 | 349768619 | DataYear | number | Calendar year (January-December) represented b... | datayear | 2 | number | 62418324 | {'largest': '2016', 'non_null': 3376, 'average... | {'precisionStyle': 'standard', 'noCommas': 'tr... | 100.0 |
| 2 | 349768631 | BuildingType | text | City of Seattle building type classification. | buildingtype | 3 | text | 62418336 | {'largest': 'SPS-District K-12', 'non_null': 3... | {} | NaN |
| 3 | 349768622 | PrimaryPropertyType | text | The primary use of a property (e.g. office, re... | primarypropertytype | 4 | text | 62418327 | {'largest': 'Worship Facility', 'non_null': 33... | {} | NaN |
| 4 | 349768617 | PropertyName | text | Official or common property name as entered in... | propertyname | 5 | text | 62418322 | {'largest': 'Zindorf Apartments', 'non_null': ... | {} | NaN |
| 5 | 349768592 | Address | text | Property street address | address | 6 | text | 62418297 | {'largest': 'Readiness Center - Pier 91', 'non... | {} | NaN |
| 6 | 349768594 | City | text | Property city | city | 7 | text | 62418299 | {'largest': 'Seattle', 'non_null': 3376, 'null... | {} | NaN |
| 7 | 349768593 | State | text | Property state | state | 8 | text | 62418298 | {'largest': 'WA', 'non_null': 3376, 'null': 0,... | {} | NaN |
| 8 | 349768610 | ZipCode | number | Property zip | zipcode | 9 | number | 62418315 | {'largest': '98272', 'non_null': 3360, 'averag... | {'precisionStyle': 'standard', 'noCommas': 'tr... | 100.0 |
| 9 | 349768606 | TaxParcelIdentificationNumber | text | Property King County PIN | taxparcelidentificationnumber | 10 | text | 62418311 | {'largest': '9835200050', 'non_null': 3376, 'n... | {} | NaN |
| 10 | 349768629 | CouncilDistrictCode | number | Property City of Seattle council district. | councildistrictcode | 11 | number | 62418334 | {'largest': '7', 'non_null': 3376, 'average': ... | {} | NaN |
| 11 | 349768627 | Neighborhood | text | Property neighborhood area defined by the City... | neighborhood | 12 | text | 62418332 | {'largest': 'SOUTHWEST', 'non_null': 3376, 'nu... | {} | NaN |
| 12 | 349768591 | Latitude | number | Property latitude. | latitude | 13 | number | 62418296 | {'largest': '47.73387', 'non_null': 3376, 'ave... | {} | NaN |
| 13 | 349768598 | Longitude | number | Property longitude. | longitude | 14 | number | 62418303 | {'largest': '-122.2209659', 'non_null': 3376, ... | {} | NaN |
| 14 | 349768623 | YearBuilt | number | Year in which a property was constructed or un... | yearbuilt | 15 | number | 62418328 | {'largest': '2015', 'non_null': 3376, 'average... | {'precisionStyle': 'standard', 'noCommas': 'tr... | 100.0 |
| 15 | 349768607 | NumberofBuildings | number | Number of buildings included in the property's... | numberofbuildings | 16 | number | 62418312 | {'largest': '111', 'non_null': 3368, 'average'... | {} | NaN |
| 16 | 349768605 | NumberofFloors | number | Number of floors reported in Portfolio Manager | numberoffloors | 17 | number | 62418310 | {'largest': '99', 'non_null': 3376, 'average':... | {} | NaN |
| 17 | 349768602 | PropertyGFATotal | number | Total building and parking gross floor area. | propertygfatotal | 18 | number | 62418307 | {'largest': '9320156', 'non_null': 3376, 'aver... | {} | NaN |
| 18 | 349768616 | PropertyGFAParking | number | Total space in square feet of all types of par... | propertygfaparking | 19 | number | 62418321 | {'largest': '512608', 'non_null': 3376, 'avera... | {} | NaN |
| 19 | 349768635 | PropertyGFABuilding(s) | number | Total floor space in square feet between the o... | propertygfabuilding_s | 20 | number | 62418340 | {'largest': '9320156', 'non_null': 3376, 'aver... | {} | NaN |
| 20 | 349768595 | ListOfAllPropertyUseTypes | text | All property uses reported in Portfolio Manager | listofallpropertyusetypes | 21 | text | 62418300 | {'largest': 'Worship Facility', 'non_null': 33... | {} | NaN |
| 21 | 349768634 | LargestPropertyUseType | text | The largest use of a property (e.g. office, re... | largestpropertyusetype | 22 | text | 62418339 | {'largest': 'Worship Facility', 'non_null': 33... | {} | NaN |
| 22 | 349768612 | LargestPropertyUseTypeGFA | number | The gross floor area (GFA) of the largest use ... | largestpropertyusetypegfa | 23 | number | 62418317 | {'largest': '9320156', 'non_null': 3356, 'aver... | {} | NaN |
| 23 | 349768630 | SecondLargestPropertyUseType | text | The second largest use of a property (e.g. off... | secondlargestpropertyusetype | 24 | text | 62418335 | {'largest': 'Worship Facility', 'non_null': 16... | {} | NaN |
| 24 | 349768614 | SecondLargestPropertyUseTypeGFA | number | The gross floor area (GFA) of the second large... | secondlargestpropertyuse | 25 | number | 62418319 | {'largest': '686750', 'non_null': 1679, 'avera... | {} | NaN |
| 25 | 349768620 | ThirdLargestPropertyUseType | text | The third largest use of a property (e.g. offi... | thirdlargestpropertyusetype | 26 | text | 62418325 | {'largest': 'Worship Facility', 'non_null': 60... | {} | NaN |
| 26 | 349768625 | ThirdLargestPropertyUseTypeGFA | number | The gross floor area (GFA) of the third larges... | thirdlargestpropertyusetypegfa | 27 | number | 62418330 | {'largest': '459748', 'non_null': 596, 'averag... | {} | NaN |
| 27 | 349768600 | YearsENERGYSTARCertified | number | Years the property has received ENERGY STAR ce... | yearsenergystarcertified | 28 | number | 62418305 | {'largest': '201620152014201320122011201020092... | {} | NaN |
| 28 | 349768596 | ENERGYSTARScore | text | An EPA calculated 1-100 rating that assesses a... | energystarscore | 29 | text | 62418301 | {'largest': 'NULL', 'non_null': 2541, 'null': ... | {} | NaN |
| 29 | 349768609 | SiteEUI(kBtu/sf) | number | Site Energy Use Intensity (EUI) is a property'... | siteeui_kbtu_sf | 30 | number | 62418314 | {'largest': '834.4000244', 'non_null': 3369, '... | {} | NaN |
| 30 | 349768611 | SiteEUIWN(kBtu/sf) | number | Weather Normalized (WN) Site Energy Use Intens... | siteeuiwn_kbtu_sf | 31 | number | 62418316 | {'largest': '834.4000244', 'non_null': 3370, '... | {} | NaN |
| 31 | 349768633 | SourceEUI(kBtu/sf) | number | Source Energy Use Intensity (EUI) is a propert... | sourceeui_kbtu_sf | 32 | number | 62418338 | {'largest': '2620', 'non_null': 3367, 'average... | {} | NaN |
| 32 | 349768599 | SourceEUIWN(kBtu/sf) | number | Weather Normalized (WN) Source Energy Use Inte... | sourceeuiwn_kbtu_sf | 33 | number | 62418304 | {'largest': '2620', 'non_null': 3367, 'average... | {} | NaN |
| 33 | 349768628 | SiteEnergyUse(kBtu) | number | The annual amount of energy consumed by the pr... | siteenergyuse_kbtu | 34 | number | 62418333 | {'largest': '873923712', 'non_null': 3371, 'av... | {} | NaN |
| 34 | 349768603 | SiteEnergyUseWN(kBtu) | number | The annual amount of energy consumed by the pr... | siteenergyusewn_kbtu | 35 | number | 62418308 | {'largest': '471613856', 'non_null': 3370, 'av... | {} | NaN |
| 35 | 349768626 | SteamUse(kBtu) | number | The annual amount of district steam consumed b... | steamuse_kbtu | 36 | number | 62418331 | {'largest': '134943456', 'non_null': 3367, 'av... | {} | NaN |
| 36 | 349768613 | Electricity(kWh) | number | The annual amount of electricity consumed by t... | electricity_kwh | 37 | number | 62418318 | {'largest': '192577488', 'non_null': 3367, 'av... | {} | NaN |
| 37 | 349768624 | Electricity(kBtu) | number | The annual amount of electricity consumed by t... | electricity_kbtu | 38 | number | 62418329 | {'largest': '657074389', 'non_null': 3367, 'av... | {} | NaN |
| 38 | 349768621 | NaturalGas(therms) | number | The annual amount of utility-supplied natural ... | naturalgas_therms | 39 | number | 62418326 | {'largest': '2979090', 'non_null': 3367, 'aver... | {} | NaN |
| 39 | 349768608 | NaturalGas(kBtu) | number | The annual amount of utility-supplied natural ... | naturalgas_kbtu | 40 | number | 62418313 | {'largest': '297909000', 'non_null': 3367, 'av... | {} | NaN |
| 40 | 349768615 | DefaultData | checkbox | The property used default data for at least on... | defaultdata | 41 | checkbox | 62418320 | {'largest': True, 'non_null': 3376, 'null': 0,... | {} | NaN |
| 41 | 349768597 | Comments | text | Comments by a building owner or agent to provi... | comments | 42 | text | 62418302 | {'non_null': 0, 'null': 3376} | {} | NaN |
| 42 | 349768618 | ComplianceStatus | text | Whether a property has met energy benchmarking... | compliancestatus | 43 | text | 62418323 | {'largest': 'Non-Compliant', 'non_null': 3376,... | {} | NaN |
| 43 | 349768601 | Outlier | text | Whether a property is a high or low outlier (Y/N) | outlier | 44 | text | 62418306 | {'largest': 'Low outlier', 'non_null': 32, 'nu... | {} | NaN |
| 44 | 349768604 | TotalGHGEmissions | text | The total amount of greenhouse gas emissions, ... | totalghgemissions | 45 | text | 62418309 | {'largest': '99.65', 'non_null': 3367, 'null':... | {'displayStyle': 'plain', 'align': 'left'} | 100.0 |
| 45 | 349768632 | GHGEmissionsIntensity | text | Total Greenhouse Gas Emissions divided by prop... | ghgemissionsintensity | 46 | text | 62418337 | {'largest': '9.98', 'non_null': 3367, 'null': ... | {} | NaN |
# Noms des variables année 2016
with pd.option_context('display.max_colwidth', None):
display(meta_data_2016[['name','description']])
| name | description | |
|---|---|---|
| 0 | OSEBuildingID | A unique identifier assigned to each property covered by the Seattle Benchmarking Ordinance for tracking and identification purposes. |
| 1 | DataYear | Calendar year (January-December) represented by each data record. |
| 2 | BuildingType | City of Seattle building type classification. |
| 3 | PrimaryPropertyType | The primary use of a property (e.g. office, retail store). Primary use is defined as a function that accounts for more than 50% of a property. This is the Property Type - EPA Calculated field from Portfolio Manager. |
| 4 | PropertyName | Official or common property name as entered in EPA’s Portfolio Manager. |
| 5 | Address | Property street address |
| 6 | City | Property city |
| 7 | State | Property state |
| 8 | ZipCode | Property zip |
| 9 | TaxParcelIdentificationNumber | Property King County PIN |
| 10 | CouncilDistrictCode | Property City of Seattle council district. |
| 11 | Neighborhood | Property neighborhood area defined by the City of Seattle Department of Neighborhoods. |
| 12 | Latitude | Property latitude. |
| 13 | Longitude | Property longitude. |
| 14 | YearBuilt | Year in which a property was constructed or underwent a complete renovation. |
| 15 | NumberofBuildings | Number of buildings included in the property's report. In cases where a property is reporting as a campus, multiple buildings may be included in one report. |
| 16 | NumberofFloors | Number of floors reported in Portfolio Manager |
| 17 | PropertyGFATotal | Total building and parking gross floor area. |
| 18 | PropertyGFAParking | Total space in square feet of all types of parking (Fully Enclosed, Partially Enclosed, and Open). |
| 19 | PropertyGFABuilding(s) | Total floor space in square feet between the outside surfaces of a building’s enclosing walls. This includes all areas inside the building(s), such as tenant space, common areas, stairwells, basements, storage, etc. |
| 20 | ListOfAllPropertyUseTypes | All property uses reported in Portfolio Manager |
| 21 | LargestPropertyUseType | The largest use of a property (e.g. office, retail store) by GFA. |
| 22 | LargestPropertyUseTypeGFA | The gross floor area (GFA) of the largest use of the property. |
| 23 | SecondLargestPropertyUseType | The second largest use of a property (e.g. office, retail store) by GFA. |
| 24 | SecondLargestPropertyUseTypeGFA | The gross floor area (GFA) of the second largest use of the property. |
| 25 | ThirdLargestPropertyUseType | The third largest use of a property (e.g. office, retail store) by GFA. |
| 26 | ThirdLargestPropertyUseTypeGFA | The gross floor area (GFA) of the third largest use of the property. |
| 27 | YearsENERGYSTARCertified | Years the property has received ENERGY STAR certification. |
| 28 | ENERGYSTARScore | An EPA calculated 1-100 rating that assesses a property’s overall energy performance, based on national data to control for differences among climate, building uses, and operations. A score of 50 represents the national median. |
| 29 | SiteEUI(kBtu/sf) | Site Energy Use Intensity (EUI) is a property's Site Energy Use divided by its gross floor area. Site Energy Use is the annual amount of all the energy consumed by the property on-site, as reported on utility bills. Site EUI is measured in thousands of British thermal units (kBtu) per square foot. |
| 30 | SiteEUIWN(kBtu/sf) | Weather Normalized (WN) Site Energy Use Intensity (EUI) is a property's WN Site Energy divided by its gross floor area (in square feet). WN Site Energy is the Site Energy Use the property would have consumed during 30-year average weather conditions. WN Site EUI is measured in measured in thousands of British thermal units (kBtu) per square foot. |
| 31 | SourceEUI(kBtu/sf) | Source Energy Use Intensity (EUI) is a property's Source Energy Use divided by its gross floor area. Source Energy Use is the annual energy used to operate the property, including losses from generation, transmission, & distribution. Source EUI is measured in thousands of British thermal units (kBtu) per square foot. |
| 32 | SourceEUIWN(kBtu/sf) | Weather Normalized (WN) Source Energy Use Intensity (EUI) is a property's WN Source Energy divided by its gross floor area. WN Source Energy is the Source Energy Use the property would have consumed during 30-year average weather conditions. WN Source EUI is measured in measured in thousands of British thermal units (kBtu) per square foot. |
| 33 | SiteEnergyUse(kBtu) | The annual amount of energy consumed by the property from all sources of energy. |
| 34 | SiteEnergyUseWN(kBtu) | The annual amount of energy consumed by the property from all sources of energy, adjusted to what the property would have consumed during 30-year average weather conditions. |
| 35 | SteamUse(kBtu) | The annual amount of district steam consumed by the property on-site, measured in thousands of British thermal units (kBtu). |
| 36 | Electricity(kWh) | The annual amount of electricity consumed by the property on-site, including electricity purchased from the grid and generated by onsite renewable systems, measured in kWh. |
| 37 | Electricity(kBtu) | The annual amount of electricity consumed by the property on-site, including electricity purchased from the grid and generated by onsite renewable systems, measured in thousands of British thermal units (kBtu). |
| 38 | NaturalGas(therms) | The annual amount of utility-supplied natural gas consumed by the property, measured in therms. |
| 39 | NaturalGas(kBtu) | The annual amount of utility-supplied natural gas consumed by the property, measured in thousands of British thermal units (kBtu). |
| 40 | DefaultData | The property used default data for at least one property characteristic. |
| 41 | Comments | Comments by a building owner or agent to provide context to the building’s energy use. |
| 42 | ComplianceStatus | Whether a property has met energy benchmarking requirements for the current reporting year. |
| 43 | Outlier | Whether a property is a high or low outlier (Y/N) |
| 44 | TotalGHGEmissions | The total amount of greenhouse gas emissions, including carbon dioxide, methane, and nitrous oxide gases released into the atmosphere as a result of energy consumption at the property, measured in metric tons of carbon dioxide equivalent. This calculation uses a GHG emissions factor from Seattle CIty Light's portfolio of generating resources. This uses Seattle City Light's 2015 emissions factor of 52.44 lbs CO2e/MWh until the 2016 factor is available. Enwave steam factor = 170.17 lbs CO2e/MMBtu. Gas factor sourced from EPA Portfolio Manager = 53.11 kg CO2e/MBtu. |
| 45 | GHGEmissionsIntensity | Total Greenhouse Gas Emissions divided by property's gross floor area, measured in kilograms of carbon dioxide equivalent per square foot. This calculation uses a GHG emissions factor from Seattle City Light's portfolio of generating resources |
Buildings2016.drop_duplicates(inplace=True)
Buildings2016.shape
(3376, 46)
Différences entre les deux datasets
col_2015=meta_data_2015['name'].tolist()
col_2016=meta_data_2016['name'].tolist()
col_2015_notin_2016 = [name for name in col_2015 if name not in col_2016]
print(' Colonnes 2015 absentes de 2016 :\n',col_2015_notin_2016,'\n')
col_2016_notin_2015 = [name for name in col_2016 if name not in col_2015]
print(' Colonnes 2016 absentes de 2015 :\n',col_2016_notin_2015)
Colonnes 2015 absentes de 2016 : ['Location', 'OtherFuelUse(kBtu)', 'GHGEmissions(MetricTonsCO2e)', 'GHGEmissionsIntensity(kgCO2e/ft2)', 'Comment', '2010 Census Tracts', 'Seattle Police Department Micro Community Policing Plan Areas', 'City Council Districts', 'SPD Beats', 'Zip Codes'] Colonnes 2016 absentes de 2015 : ['Address', 'City', 'State', 'ZipCode', 'Latitude', 'Longitude', 'Comments', 'TotalGHGEmissions', 'GHGEmissionsIntensity']
Aprés lecture des informations sur les variables
Buildings2015.rename(columns={'GHGEmissions(MetricTonsCO2e)': 'TotalGHGEmissions',
'GHGEmissionsIntensity(kgCO2e/ft2)': 'GHGEmissionsIntensity'
}, inplace=True)
col_2015=Buildings2015.columns.to_list()
col_2016=Buildings2016.columns.to_list()
col_2015_notin_2016 = [name for name in col_2015 if name not in col_2016]
print('Colonnes 2015 absentes de 2016 :\n',col_2015_notin_2016,'\n')
col_2016_notin_2015 = [name for name in col_2016 if name not in col_2015]
print(' Colonnes 2016 absentes de 2015 :\n',col_2016_notin_2015)
Colonnes 2015 absentes de 2016 : ['Location', 'OtherFuelUse(kBtu)', 'Comment', '2010 Census Tracts', 'Seattle Police Department Micro Community Policing Plan Areas', 'City Council Districts', 'SPD Beats', 'Zip Codes'] Colonnes 2016 absentes de 2015 : ['Address', 'City', 'State', 'ZipCode', 'Latitude', 'Longitude', 'Comments']
Suppression des colonnes ne correspondant pas car soit elles contiennent des informations de localisation précises sur les batiments et risquent de mener le modèle à apprendre 'par coeur' les consommations demandées, soit des informations non pertinentes.
Buildings2015.drop(col_2015_notin_2016, axis=1, inplace=True)
Buildings2016.drop(col_2016_notin_2015, axis=1, inplace=True)
#Suppression de TaxParcelIdentificationNumber pour le mêmes raisons
Buildings2015.drop('TaxParcelIdentificationNumber', axis=1, inplace=True)
Buildings2016.drop('TaxParcelIdentificationNumber', axis=1, inplace=True)
Concaténation des deux dataFrames
data=pd.concat([Buildings2015,Buildings2016],ignore_index=True)
data.shape
(6716, 38)
msno.matrix(data,color=bleu)
<AxesSubplot:>
Dédoublonnage
# Formatagge des noms des propriétés
data['PropertyName']=data['PropertyName'].apply(lambda s : s.upper())
data['PropertyName'].duplicated(keep=False).sum()
1844
data['OSEBuildingID'].duplicated(keep=False).sum()
6568
Je choisis de garder les relevés 2016 pour les bâtiments apparaissant sur les deux années et les bâtiments de 2015 n'ayant pas fait l'objet de relevés en 2016.
# filtrage sur les noms des propriétés
data.drop_duplicates(subset='OSEBuildingID',keep='last',inplace=True)
data.shape
(3432, 38)
# Filtrage sur les variables des batiments restants
data.duplicated(keep=False).sum()
0
data.drop('OSEBuildingID',axis=1,inplace=True)
Pas de doublons
msno.matrix(data,color=bleu)
<AxesSubplot:>
ENERGY USE :
TotalGHGEmissions :
Portfolio Manager calculates your building's greenhouse gas emissions (including carbon dioxide, methane, and nitrous oxide) from on-site fuel combustion and purchased electricity and district heating and cooling. Portfolio Manager also enables tracking of avoided emissions from any green power purchases.
The methodology for calculating greenhouse gas emissions in Portfolio Manager was designed to be consistent with the Greenhouse Gas Protocol developed by the World Resources Institute and World Business Council for Sustainable Development, and is compatible with the accounting, inventory and reporting requirements of the Center for Corporate Climate Leadership as well as other state and NGO registry and reporting programs.
# Exclusion des sujets pour lesquels on ne connait pas les cibles
data=data.dropna(subset=['SiteEnergyUse(kBtu)','TotalGHGEmissions'],axis=0)
print('Reste',data.shape[0],'batiments pour lesquels on connaît ENERGY USE et TotalGHGEmissions ainsi que',data.shape[1]-1, 'autres indicateurs.')
Reste 3422 batiments pour lesquels on connaît ENERGY USE et TotalGHGEmissions ainsi que 36 autres indicateurs.
msno.matrix(data,color=bleu)
<AxesSubplot:>
exclues=['SiteEUI(kBtu/sf)',
'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)',
'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)', 'Electricity(kWh)',
'Electricity(kBtu)', 'NaturalGas(therms)', 'NaturalGas(kBtu)',
'GHGEmissionsIntensity']
data.drop(exclues,axis=1,inplace=True)
msno.matrix(data,color=bleu)
print('Reste',data.shape[0],'batiments pour lesquels on connaît ENERGY USE et TotalGHGEmissions ainsi que',data.shape[1]-1, 'autres indicateurs.')
Reste 3422 batiments pour lesquels on connaît ENERGY USE et TotalGHGEmissions ainsi que 25 autres indicateurs.
data.describe(include='all')
| DataYear | BuildingType | PrimaryPropertyType | PropertyName | CouncilDistrictCode | Neighborhood | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | YearsENERGYSTARCertified | ENERGYSTARScore | SiteEnergyUse(kBtu) | TotalGHGEmissions | DefaultData | ComplianceStatus | Outlier | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3422.000000 | 3422 | 3422 | 3422 | 3422.00000 | 3422 | 3422.000000 | 3422.000000 | 3422.000000 | 3.422000e+03 | 3422.000000 | 3.422000e+03 | 3406 | 3395 | 3.395000e+03 | 1698 | 1698.000000 | 602 | 602.000000 | 120 | 2574.000000 | 3.422000e+03 | 3422.000000 | 3422 | 3422 | 35 |
| unique | NaN | 8 | 26 | 3405 | NaN | 19 | NaN | NaN | NaN | NaN | NaN | NaN | 472 | 57 | NaN | 50 | NaN | 44 | NaN | 65 | NaN | NaN | NaN | 4 | 4 | 3 |
| top | NaN | NonResidential | Low-Rise Multifamily | NORTHGATE PLAZA | NaN | DOWNTOWN | NaN | NaN | NaN | NaN | NaN | NaN | Multifamily Housing | Multifamily Housing | NaN | Parking | NaN | Retail Store | NaN | 2016 | NaN | NaN | NaN | False | Compliant | Low outlier |
| freq | NaN | 1486 | 1002 | 3 | NaN | 585 | NaN | NaN | NaN | NaN | NaN | NaN | 874 | 1683 | NaN | 988 | NaN | 112 | NaN | 14 | NaN | NaN | NaN | 3254 | 3262 | 23 |
| mean | 2015.983928 | NaN | NaN | NaN | 4.44886 | NaN | 1968.364991 | 1.106955 | 4.692285 | 9.474116e+04 | 8185.718878 | 8.655545e+04 | NaN | NaN | 7.923501e+04 | NaN | 28482.410658 | NaN | 11694.299666 | NaN | 68.004662 | 5.424472e+06 | 120.539719 | NaN | NaN | NaN |
| std | 0.125773 | NaN | NaN | NaN | 2.11967 | NaN | 33.199569 | 2.094163 | 5.465658 | 2.179351e+05 | 33075.053259 | 2.067303e+05 | NaN | NaN | 2.011440e+05 | NaN | 54431.218483 | NaN | 29195.448352 | NaN | 26.795406 | 2.162899e+07 | 541.496716 | NaN | NaN | NaN |
| min | 2015.000000 | NaN | NaN | NaN | 1.00000 | NaN | 1900.000000 | 0.000000 | 0.000000 | 1.128500e+04 | 0.000000 | 3.636000e+03 | NaN | NaN | 5.656000e+03 | NaN | 0.000000 | NaN | 0.000000 | NaN | 1.000000 | 0.000000e+00 | -0.800000 | NaN | NaN | NaN |
| 25% | 2016.000000 | NaN | NaN | NaN | 3.00000 | NaN | 1948.000000 | 1.000000 | 2.000000 | 2.843200e+04 | 0.000000 | 2.764050e+04 | NaN | NaN | 2.505400e+04 | NaN | 5000.000000 | NaN | 2241.750000 | NaN | 53.000000 | 9.242540e+05 | 9.422500 | NaN | NaN | NaN |
| 50% | 2016.000000 | NaN | NaN | NaN | 4.00000 | NaN | 1974.000000 | 1.000000 | 4.000000 | 4.417500e+04 | 0.000000 | 4.319150e+04 | NaN | NaN | 3.987000e+04 | NaN | 10664.000000 | NaN | 5043.000000 | NaN | 75.000000 | 1.797181e+06 | 33.790000 | NaN | NaN | NaN |
| 75% | 2016.000000 | NaN | NaN | NaN | 7.00000 | NaN | 1997.000000 | 1.000000 | 5.000000 | 9.085975e+04 | 0.000000 | 8.414475e+04 | NaN | NaN | 7.571750e+04 | NaN | 26835.000000 | NaN | 10226.250000 | NaN | 90.000000 | 4.219434e+06 | 93.975000 | NaN | NaN | NaN |
| max | 2016.000000 | NaN | NaN | NaN | 7.00000 | NaN | 2015.000000 | 111.000000 | 99.000000 | 9.320156e+06 | 512608.000000 | 9.320156e+06 | NaN | NaN | 9.320156e+06 | NaN | 686750.000000 | NaN | 459748.000000 | NaN | 100.000000 | 8.739237e+08 | 16870.980000 | NaN | NaN | NaN |
Remarques sur les valeurs relevant des erreurs de remplissage
data[data['NumberofFloors'] <=0]
| DataYear | BuildingType | PrimaryPropertyType | PropertyName | CouncilDistrictCode | Neighborhood | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | YearsENERGYSTARCertified | ENERGYSTARScore | SiteEnergyUse(kBtu) | TotalGHGEmissions | DefaultData | ComplianceStatus | Outlier | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3506 | 2016 | NonResidential | Hotel | GRAND HYATT SEATTLE | 7 | DOWNTOWN | 1999 | 1.0 | 0.0 | 934292 | 0 | 934292 | Hotel, Other, Parking, Restaurant, Retail Store | Hotel | 495536.0 | Parking | 438756.0 | Restaurant | 49700.0 | NaN | NaN | 6.504728e+07 | 1638.46 | False | Compliant | NaN |
| 3827 | 2016 | NonResidential | Medical Office | ARNOLD PAVILION | 3 | EAST | 2004 | 1.0 | 0.0 | 225982 | 0 | 225982 | Medical Office | Medical Office | 200184.0 | NaN | NaN | NaN | NaN | NaN | 53.0 | 2.056062e+07 | 692.12 | False | Compliant | NaN |
| 3828 | 2016 | NonResidential | Mixed Use Property | 2200 WESTLAKE - SEDO | 7 | DOWNTOWN | 2006 | 0.0 | 0.0 | 516407 | 0 | 516407 | Financial Office, Hotel, Multifamily Housing, ... | Parking | 385196.0 | Multifamily Housing | 340236.0 | Hotel | 67627.0 | NaN | 31.0 | 3.940332e+07 | 734.72 | False | Compliant | NaN |
| 3904 | 2016 | NonResidential | Other | PACIFIC PLACE | 7 | DOWNTOWN | 1999 | 1.0 | 0.0 | 947987 | 0 | 947987 | Office, Other - Entertainment/Public Assembly,... | Other - Mall | 561684.0 | Other - Entertainment/Public Assembly | 100000.0 | Restaurant | 80000.0 | NaN | NaN | 4.651096e+07 | 324.25 | False | Compliant | NaN |
| 5094 | 2016 | NonResidential | Medical Office | HART FIRST HILL LLC | 3 | EAST | 1945 | 1.0 | 0.0 | 274568 | 0 | 274568 | Medical Office, Parking | Medical Office | 228133.0 | Parking | 135954.0 | NaN | NaN | NaN | 52.0 | 2.531153e+07 | 1110.07 | False | Compliant | NaN |
| 5333 | 2016 | Campus | Other | (ID#24086)CAMPUS1:KC METRO TRANSIT ATLANTIC CE... | 2 | GREATER DUWAMISH | 1991 | 10.0 | 0.0 | 230971 | 0 | 230971 | Other, Parking | Other | 230970.0 | Parking | 0.0 | NaN | NaN | NaN | NaN | 2.102229e+07 | 405.05 | False | Compliant | NaN |
| 6470 | 2016 | NonResidential | Warehouse | SANDPOINT #5 | 4 | NORTHEAST | 1940 | 1.0 | 0.0 | 384772 | 0 | 384772 | Non-Refrigerated Warehouse, Office | Non-Refrigerated Warehouse | 349953.0 | Office | 63872.0 | NaN | NaN | NaN | 69.0 | 1.520676e+07 | 391.70 | False | Compliant | NaN |
| 6471 | 2016 | NonResidential | Medical Office | SANDPOINT #25 | 4 | NORTHEAST | 1960 | 1.0 | 0.0 | 30287 | 0 | 30287 | Medical Office | Medical Office | 28050.0 | NaN | NaN | NaN | NaN | NaN | 32.0 | 2.193115e+06 | 32.72 | False | Compliant | NaN |
| 6472 | 2016 | NonResidential | Small- and Mid-Sized Office | SANDPOINT #29 | 4 | NORTHEAST | 1960 | 1.0 | 0.0 | 21931 | 0 | 21931 | Office | Office | 31845.0 | Office | 31845.0 | NaN | NaN | NaN | 32.0 | 3.947209e+06 | 145.11 | False | Compliant | NaN |
| 6508 | 2016 | Nonresidential COS | Other | MAGNUSON | 4 | NORTHEAST | 1929 | 8.0 | 0.0 | 502030 | 0 | 502030 | Other - Recreation | Other - Recreation | 564258.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1.847034e+07 | 549.11 | False | Compliant | NaN |
| 6613 | 2016 | NonResidential | Other | SMILOW RAINIER VISTA BOYS & GIRLS CLUB | 2 | SOUTHEAST | 2009 | 1.0 | 0.0 | 40265 | 0 | 40265 | Pre-school/Daycare | Pre-school/Daycare | 40265.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2.159170e+06 | 51.39 | False | Compliant | NaN |
| 6614 | 2016 | Campus | University | UNIVERSITY OF WASHINGTON - SEATTLE CAMPUS | 4 | NORTHEAST | 1900 | 111.0 | 0.0 | 9320156 | 0 | 9320156 | College/University | College/University | 9320156.0 | NaN | NaN | NaN | NaN | NaN | NaN | 8.739237e+08 | 11140.56 | False | Compliant | NaN |
| 6616 | 2016 | NonResidential | Residence Hall | CEDAR HALL | 4 | NORTHEAST | 2011 | 1.0 | 0.0 | 176225 | 0 | 176225 | Parking, Residence Hall/Dormitory | Residence Hall/Dormitory | 176225.0 | Parking | 24000.0 | NaN | NaN | NaN | 91.0 | 4.926695e+06 | 103.65 | False | Compliant | NaN |
| 6618 | 2016 | NonResidential | Residence Hall | LANDER HALL | 4 | NORTHEAST | 1953 | 1.0 | 0.0 | 166991 | 0 | 166991 | Residence Hall/Dormitory | Residence Hall/Dormitory | 166991.0 | NaN | NaN | NaN | NaN | NaN | 19.0 | 1.596586e+07 | 437.20 | False | Compliant | NaN |
| 6619 | 2016 | NonResidential | Residence Hall | MERCER HALL | 4 | NORTHEAST | 1970 | 1.0 | 0.0 | 89392 | 0 | 89392 | Residence Hall/Dormitory | Residence Hall/Dormitory | 89392.0 | NaN | NaN | NaN | NaN | NaN | 16.0 | 6.960904e+06 | 48.53 | False | Compliant | NaN |
| 6620 | 2016 | NonResidential | Residence Hall | POPLAR HALL | 4 | NORTHEAST | 2011 | 1.0 | 0.0 | 97040 | 0 | 97040 | Residence Hall/Dormitory | Residence Hall/Dormitory | 97040.0 | NaN | NaN | NaN | NaN | NaN | 69.0 | 3.894142e+06 | 62.41 | False | Compliant | NaN |
# Correction du nombre d'étages : au moins 1 étage
data['NumberofFloors']=data['NumberofFloors'].apply(lambda x: x if x>0 else 1)
# Correction du nombre de bâtiments: au moins 1
data['NumberofBuildings']=data['NumberofBuildings'].apply(lambda x: x if x>0 else 1)
data.drop(data[data['SiteEnergyUse(kBtu)'] <= 0].index,inplace=True)
data.shape
(3404, 26)
data.drop(data[data['TotalGHGEmissions'] <= 0].index,inplace=True)
data.shape
(3401, 26)
# Function calculant le nombre de valeurs manquantes (Nan) par colonnes
def missing_values(table,tri=True,sens=True):
# Total Nan
mis_val = table.isnull().sum()
# Pourcentage de Nan
mis_val_percent = table.isnull().sum() / len(table) * 100
# Data Frame contenant les résultats
mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
# Nommage des colonnes
mis_val_table_ren_columns = mis_val_table.rename(
columns = {0 : 'Valeurs manquantes', 1 : '% du Total des valeurs'})
if tri:
# Avec tri sur le pourcentage de valeurs manquantesSort the table by percentage of missing descending
mis_val_table_ren_columns = mis_val_table_ren_columns[
mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
'% du Total des valeurs', ascending=sens).round(2)
else:
# Sans tri sur le poucentage de valeurs manquantes
mis_val_table_ren_columns = mis_val_table_ren_columns[
mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
'% du Total des valeurs', ascending=False).round(2)
# Affichage du nom de la table analysée
# du nombre de colonnes concernées par des valeurs manquantes
display (" Data Frame a " + str(table.shape[1]) + " colonnes.\n"
"Dont " + str(mis_val_table_ren_columns.shape[0]) +
" colonnes contiennent des valeurs manquantes.")
# Retourne la tables contenant les stats par colonnes
return mis_val_table_ren_columns
missing_values(data,True,False)
' Data Frame a 26 colonnes.\nDont 10 colonnes contiennent des valeurs manquantes.'
| Valeurs manquantes | % du Total des valeurs | |
|---|---|---|
| Outlier | 3366 | 98.97 |
| YearsENERGYSTARCertified | 3282 | 96.50 |
| ThirdLargestPropertyUseType | 2800 | 82.33 |
| ThirdLargestPropertyUseTypeGFA | 2800 | 82.33 |
| SecondLargestPropertyUseType | 1706 | 50.16 |
| SecondLargestPropertyUseTypeGFA | 1706 | 50.16 |
| ENERGYSTARScore | 831 | 24.43 |
| LargestPropertyUseType | 27 | 0.79 |
| LargestPropertyUseTypeGFA | 27 | 0.79 |
| ListOfAllPropertyUseTypes | 16 | 0.47 |
Gestion des Nan de Outlier
data['Outlier']=data['Outlier'].str.lower()
data['Outlier'].value_counts()
low outlier 26 high outlier 9 Name: Outlier, dtype: int64
mask_high=data['Outlier']== 'high outlier'
data[mask_high]
| DataYear | BuildingType | PrimaryPropertyType | PropertyName | CouncilDistrictCode | Neighborhood | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | YearsENERGYSTARCertified | ENERGYSTARScore | SiteEnergyUse(kBtu) | TotalGHGEmissions | DefaultData | ComplianceStatus | Outlier | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3566 | 2016 | NonResidential | Large Office | SECOND AND SPRING BUILDING | 7 | DOWNTOWN | 1958 | 1.0 | 6.0 | 172842 | 25920 | 146922 | Data Center, Office, Parking | Office | 99890.0 | Data Center | 39336.0 | Parking | 25000.0 | NaN | NaN | 41399504.0 | 362.66 | False | Non-Compliant | high outlier |
| 3857 | 2016 | NonResidential | Large Office | 401 ELLIOTT AVE WEST | 7 | MAGNOLIA / QUEEN ANNE | 2000 | 1.0 | 4.0 | 129551 | 42500 | 87051 | Data Center, Office, Parking | Office | 82273.0 | Parking | 42500.0 | Data Center | 4778.0 | NaN | 1.0 | 27137190.0 | 189.18 | False | Non-Compliant | high outlier |
| 5047 | 2016 | Multifamily LR (1-4) | Mid-Rise Multifamily | TAYLOR ANNE CONDOMINIUM | 7 | MAGNOLIA / QUEEN ANNE | 1969 | 1.0 | 5.0 | 30569 | 0 | 30569 | Multifamily Housing | Multifamily Housing | 30921.0 | NaN | NaN | NaN | NaN | NaN | 1.0 | 6834431.0 | 54.25 | False | Non-Compliant | high outlier |
| 6103 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | ATLAS BUILDING | 2 | DOWNTOWN | 1920 | 1.0 | 4.0 | 46560 | 0 | 46560 | Multifamily Housing, Restaurant, Retail Store | Multifamily Housing | 46560.0 | Retail Store | 3000.0 | Restaurant | 2000.0 | NaN | NaN | 6881405.5 | 284.42 | False | Non-Compliant | high outlier |
| 6203 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | ST. JOHNS APARTMENTS | 3 | EAST | 1910 | 1.0 | 3.0 | 24682 | 0 | 24682 | Multifamily Housing, Retail Store | Multifamily Housing | 17822.0 | Retail Store | 6085.0 | NaN | NaN | NaN | 74.0 | 2681771.0 | 96.18 | False | Non-Compliant | high outlier |
| 6262 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | VALI HAI | 7 | MAGNOLIA / QUEEN ANNE | 1959 | 2.0 | 4.0 | 43346 | 0 | 43346 | Multifamily Housing, Parking, Swimming Pool | Multifamily Housing | 43346.0 | Parking | 12600.0 | Swimming Pool | 0.0 | NaN | 5.0 | 4519115.0 | 148.02 | False | Non-Compliant | high outlier |
| 6422 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | BUILDING 4 | 5 | NORTH | 2008 | 1.0 | 4.0 | 46725 | 0 | 46725 | Multifamily Housing | Multifamily Housing | 102522.0 | NaN | NaN | NaN | NaN | NaN | 1.0 | 13459221.0 | 531.96 | False | Non-Compliant | high outlier |
| 6432 | 2016 | Multifamily MR (5-9) | Mid-Rise Multifamily | COMMODORE DUCHESS APARTMENTS | 4 | NORTHEAST | 1925 | 1.0 | 8.0 | 97923 | 0 | 97923 | Multifamily Housing | Multifamily Housing | 70143.0 | Office | 220.0 | NaN | NaN | NaN | 5.0 | 10343027.0 | 686.53 | False | Non-Compliant | high outlier |
| 6605 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | STEWART HOUSE | 7 | DOWNTOWN | 1904 | 1.0 | 3.0 | 49000 | 0 | 49000 | Food Sales, Multifamily Housing, Restaurant, R... | Multifamily Housing | 21600.0 | Food Sales | 5663.0 | Retail Store | 4027.0 | NaN | NaN | 4602793.5 | 78.39 | False | Non-Compliant | high outlier |
mask_low=data['Outlier']== 'low outlier'
data[mask_low]
| DataYear | BuildingType | PrimaryPropertyType | PropertyName | CouncilDistrictCode | Neighborhood | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | YearsENERGYSTARCertified | ENERGYSTARScore | SiteEnergyUse(kBtu) | TotalGHGEmissions | DefaultData | ComplianceStatus | Outlier | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 702 | 2015 | NonResidential | Small- and Mid-Sized Office | 1916 BOREN BUILDING | 7 | DOWNTOWN | 1929 | 1.0 | 3.0 | 21600 | 0 | 21600 | Office | Office | 21600.0 | NaN | NaN | NaN | NaN | NaN | NaN | 1.338800e+05 | 0.93 | No | Compliant | low outlier |
| 1577 | 2015 | NonResidential | Small- and Mid-Sized Office | SALTY DOG POTTERY & THE BOATWRIGHT | 6 | BALLARD | 1903 | 1.0 | 3.0 | 24913 | 0 | 24913 | Office | Office | 24913.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 2.382550e+05 | 3.59 | No | Compliant | low outlier |
| 2055 | 2015 | Multifamily LR (1-4) | Low-Rise Multifamily | BRISTOL APARTMENTS | 3 | EAST | 1908 | 1.0 | 3.0 | 22308 | 0 | 22308 | Multifamily Housing | Multifamily Housing | 22308.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 1.948390e+05 | 1.38 | No | Compliant | low outlier |
| 3724 | 2016 | NonResidential | Retail Store | UNIVERSITY CENTER | 4 | NORTHEAST | 1987 | 1.0 | 2.0 | 69492 | 0 | 69492 | Retail Store | Retail Store | 69800.0 | Other | 0.0 | NaN | NaN | NaN | 100.0 | 3.189628e+05 | 2.22 | False | Non-Compliant | low outlier |
| 3788 | 2016 | NonResidential | Large Office | 411 1ST AVE S (ID608) | 2 | DOWNTOWN | 1913 | 5.0 | 7.0 | 154159 | 0 | 154159 | Office | Office | 193154.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 1.119592e+07 | 29.43 | False | Non-Compliant | low outlier |
| 4060 | 2016 | Multifamily HR (10+) | High-Rise Multifamily | BAY VISTA | 7 | DOWNTOWN | 1982 | 1.0 | 24.0 | 167941 | 0 | 167941 | Multifamily Housing, Swimming Pool | Multifamily Housing | 138787.0 | Parking | 99952.0 | Swimming Pool | 0.0 | NaN | 100.0 | 2.407904e+06 | 29.25 | False | Non-Compliant | low outlier |
| 4174 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | BELMONT LOFTS | 3 | EAST | 1989 | 1.0 | 4.0 | 31702 | 0 | 31702 | Multifamily Housing, Parking | Multifamily Housing | 26418.0 | Parking | 14280.0 | NaN | NaN | NaN | NaN | 2.559712e+05 | 1.78 | False | Non-Compliant | low outlier |
| 4179 | 2016 | SPS-District K-12 | K-12 School | QUEEN ANNE GYM | 7 | MAGNOLIA / QUEEN ANNE | 2001 | 1.0 | 1.0 | 35805 | 0 | 35805 | K-12 School | K-12 School | 35805.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 4.314717e+05 | 11.54 | True | Error - Correct Default Data | low outlier |
| 4302 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | CITY LIGHTS ON HARBOR | 1 | SOUTHWEST | 1990 | 1.0 | 3.0 | 70207 | 0 | 70207 | Multifamily Housing | Multifamily Housing | 71273.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 7.024814e+05 | 4.90 | False | Non-Compliant | low outlier |
| 4385 | 2016 | NonResidential | Retail Store | SEATTLE HABITAT STORE | 2 | GREATER DUWAMISH | 1953 | 1.0 | 1.0 | 39900 | 0 | 39900 | Non-Refrigerated Warehouse, Retail Store | Retail Store | 25000.0 | Non-Refrigerated Warehouse | 15000.0 | NaN | NaN | NaN | 100.0 | 1.454688e+05 | 5.46 | True | Error - Correct Default Data | low outlier |
| 4569 | 2016 | NonResidential | Small- and Mid-Sized Office | 1518 FIFITH AVE | 7 | DOWNTOWN | 1903 | 1.0 | 3.0 | 57720 | 0 | 57720 | Office | Office | 25000.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2.410550e+04 | 0.17 | False | Non-Compliant | low outlier |
| 4635 | 2016 | NonResidential | Other | THE LUSTY LADY | 7 | DOWNTOWN | 1900 | 1.0 | 3.0 | 49760 | 0 | 49760 | Other | Other | 24019.0 | NaN | NaN | NaN | NaN | NaN | NaN | 4.429350e+04 | 0.31 | False | Non-Compliant | low outlier |
| 4681 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | DIPLOMAT CONDOMINIUMS | 1 | SOUTHWEST | 1975 | 1.0 | 4.0 | 33948 | 0 | 33948 | Multifamily Housing | Multifamily Housing | 24242.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 1.914979e+05 | 1.34 | False | Non-Compliant | low outlier |
| 4951 | 2016 | NonResidential | Worship Facility | FREEDOM CHURCH | 1 | SOUTHWEST | 1971 | 1.0 | 1.0 | 23772 | 0 | 23772 | Worship Facility | Worship Facility | 23772.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 1.008417e+05 | 0.70 | False | Non-Compliant | low outlier |
| 5228 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | ANNE VISTA APARTMENTS | 1 | SOUTHWEST | 1957 | 1.0 | 3.0 | 20184 | 0 | 20184 | Multifamily Housing | Multifamily Housing | 20184.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 1.648179e+05 | 1.15 | False | Non-Compliant | low outlier |
| 5285 | 2016 | NonResidential | Small- and Mid-Sized Office | 1416 S JACKSON | 3 | CENTRAL | 1947 | 1.0 | 1.0 | 45068 | 0 | 45068 | Office | Office | 45068.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 2.848573e+05 | 8.59 | False | Non-Compliant | low outlier |
| 5469 | 2016 | NonResidential | K-12 School | ISLAMIC SCHOOL OF SEATTLE | 3 | CENTRAL | 1929 | 1.0 | 2.0 | 24152 | 0 | 24152 | K-12 School | K-12 School | 24152.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 1.613634e+05 | 1.12 | False | Non-Compliant | low outlier |
| 5529 | 2016 | NonResidential | Other | 1701 FIRST AVE SOUTH LLC | 2 | GREATER DUWAMISH | 1910 | 1.0 | 3.0 | 27690 | 0 | 27690 | Other, Parking | Other | 24717.0 | Parking | 0.0 | NaN | NaN | NaN | NaN | 1.680890e+04 | 0.12 | False | Non-Compliant | low outlier |
| 5556 | 2016 | NonResidential | Small- and Mid-Sized Office | 2233 BUILDING | 2 | GREATER DUWAMISH | 1910 | 1.0 | 2.0 | 20970 | 0 | 20970 | Office, Parking | Office | 20970.0 | Parking | 0.0 | NaN | NaN | NaN | 100.0 | 2.044991e+05 | 5.43 | False | Non-Compliant | low outlier |
| 5693 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | ONE WEST CONDOMINIUMS | 1 | SOUTHWEST | 1984 | 1.0 | 4.0 | 26261 | 0 | 26261 | Multifamily Housing | Multifamily Housing | 26261.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 2.000566e+05 | 1.39 | False | Non-Compliant | low outlier |
| 5750 | 2016 | NonResidential | Hotel | J & M HOTEL BUILDING (ID25553) | 7 | DOWNTOWN | 1900 | 1.0 | 3.0 | 25450 | 0 | 25450 | Hotel | Hotel | 25450.0 | NaN | NaN | NaN | NaN | NaN | 99.0 | 5.037447e+05 | 3.51 | False | Non-Compliant | low outlier |
| 5790 | 2016 | NonResidential | Low-Rise Multifamily | (ID25674) COMET TAVERN | 3 | EAST | 1910 | 1.0 | 3.0 | 32100 | 0 | 32100 | Bar/Nightclub, Multifamily Housing | Multifamily Housing | 21400.0 | Bar/Nightclub | 10700.0 | NaN | NaN | NaN | NaN | 1.082004e+05 | 5.22 | False | Non-Compliant | low outlier |
| 5816 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | WEST OLYMPIC CONDOMINIUMS | 1 | SOUTHWEST | 1978 | 1.0 | 4.0 | 22000 | 0 | 22000 | Multifamily Housing, Parking | Multifamily Housing | 22000.0 | Parking | 10000.0 | NaN | NaN | NaN | 100.0 | 1.547981e+05 | 1.08 | False | Non-Compliant | low outlier |
| 6141 | 2016 | NonResidential | Worship Facility | SEATTLE COMMUNITY CHURCH | 4 | NORTHEAST | 1954 | 1.0 | 2.0 | 20039 | 0 | 20039 | Worship Facility | Worship Facility | 20039.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 1.047223e+05 | 0.73 | False | Non-Compliant | low outlier |
| 6492 | 2016 | Nonresidential COS | Other | GEORGETOWN STEAMPLANT | 2 | GREATER DUWAMISH | 1906 | 1.0 | 2.0 | 39212 | 0 | 39212 | Other | Other | 39212.0 | NaN | NaN | NaN | NaN | NaN | NaN | 7.237040e+04 | 0.50 | False | Non-Compliant | low outlier |
| 6681 | 2016 | Multifamily MR (5-9) | Mid-Rise Multifamily | THE WESTSIDE OFF BROADWAY | 3 | EAST | 2015 | 1.0 | 7.0 | 41384 | 15062 | 26322 | Multifamily Housing, Other - Restaurant/Bar, P... | Multifamily Housing | 34643.0 | Parking | 8266.0 | Personal Services (Health/Beauty, Dry Cleaning... | 1714.0 | NaN | 100.0 | 1.340900e+04 | 0.09 | False | Non-Compliant | low outlier |
Je choisis de garder cette variable pour pouvoir étudier les effets des outliers sur les modèles,
Remplacement des valeurs manquantes par 'normal'
data['Outlier'].fillna('normal',inplace=True)
missing_values(data,True)
' Data Frame a 26 colonnes.\nDont 9 colonnes contiennent des valeurs manquantes.'
| Valeurs manquantes | % du Total des valeurs | |
|---|---|---|
| ListOfAllPropertyUseTypes | 16 | 0.47 |
| LargestPropertyUseType | 27 | 0.79 |
| LargestPropertyUseTypeGFA | 27 | 0.79 |
| ENERGYSTARScore | 831 | 24.43 |
| SecondLargestPropertyUseType | 1706 | 50.16 |
| SecondLargestPropertyUseTypeGFA | 1706 | 50.16 |
| ThirdLargestPropertyUseType | 2800 | 82.33 |
| ThirdLargestPropertyUseTypeGFA | 2800 | 82.33 |
| YearsENERGYSTARCertified | 3282 | 96.50 |
Gestion des Nan de ListOfAllPropertyUseTypes
Suppression des 16 enregistrements pour lesquels on ne connaît pas la listedes usages
data.dropna(subset=['ListOfAllPropertyUseTypes'],axis=0,inplace=True)
data.shape
(3385, 26)
Gestion des Nan de LargestPropertyUseType
temp=data[data['LargestPropertyUseType'].isna()]
display(temp)
temp.shape
| DataYear | BuildingType | PrimaryPropertyType | PropertyName | CouncilDistrictCode | Neighborhood | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | ListOfAllPropertyUseTypes | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | YearsENERGYSTARCertified | ENERGYSTARScore | SiteEnergyUse(kBtu) | TotalGHGEmissions | DefaultData | ComplianceStatus | Outlier | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3693 | 2016 | NonResidential | Self-Storage Facility | MARKET ST CENTER | 6 | BALLARD | 1946 | 2.0 | 2.0 | 111445 | 0 | 111445 | Fitness Center/Health Club/Gym, Office, Other ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5.697472e+06 | 163.83 | False | Compliant | normal |
| 4417 | 2016 | Multifamily HR (10+) | High-Rise Multifamily | THE CONCORD | 7 | DOWNTOWN | 1999 | 1.0 | 13.0 | 219874 | 0 | 219874 | Multifamily Housing | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 17.0 | 1.321755e+07 | 408.82 | False | Compliant | normal |
| 4487 | 2016 | NonResidential | Hotel | PALLADIAN HOTEL | 7 | DOWNTOWN | 1910 | 1.0 | 8.0 | 61721 | 0 | 61721 | Hotel | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 93.0 | 2.897080e+06 | 36.92 | False | Compliant | normal |
| 4823 | 2016 | Multifamily MR (5-9) | Mid-Rise Multifamily | 500 ELLIOTT HOMES CONDOMINIUM | 7 | MAGNOLIA / QUEEN ANNE | 2000 | 1.0 | 5.0 | 45715 | 0 | 45715 | Multifamily Housing | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 72.0 | 1.289661e+06 | 8.99 | False | Compliant | normal |
| 5096 | 2016 | Multifamily HR (10+) | High-Rise Multifamily | KELLEHER HOUSE | 3 | EAST | 1982 | 1.0 | 17.0 | 76855 | 0 | 76855 | Multifamily Housing | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 10.0 | 2.718223e+06 | 18.95 | False | Compliant | normal |
| 5676 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | MARIA CHALET | 5 | NORTHWEST | 1978 | 1.0 | 4.0 | 20844 | 0 | 20844 | Multifamily Housing | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5.597181e+05 | 3.90 | False | Compliant | normal |
| 5754 | 2016 | NonResidential | Small- and Mid-Sized Office | TALON NORTHLAKE LLC | 4 | LAKE UNION | 2008 | 1.0 | 4.0 | 48350 | 0 | 48350 | Office | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 45.0 | 3.168131e+06 | 22.09 | False | Compliant | normal |
| 5799 | 2016 | NonResidential | Restaurant | BUSH GARDEN - RESTURANT & LOUNGE | 2 | DOWNTOWN | 1913 | 1.0 | 3.0 | 28800 | 0 | 28800 | Restaurant | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 8.999242e+05 | 29.21 | False | Compliant | normal |
| 5885 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | WESTWOOD PLAZA CONDOMINIUMS | 1 | DELRIDGE | 1981 | 1.0 | 3.0 | 35122 | 0 | 35122 | Multifamily Housing | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 43.0 | 9.365844e+05 | 6.53 | False | Compliant | normal |
| 6226 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | RAVENNA WOODS | 5 | NORTHEAST | 1981 | 1.0 | 3.0 | 42448 | 0 | 42448 | Multifamily Housing | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 66.0 | 1.247918e+06 | 8.70 | False | Compliant | normal |
| 6380 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | PINEHURST LANE | 5 | NORTH | 1997 | 1.0 | 4.0 | 23175 | 0 | 23175 | Multifamily Housing, Office | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 7.152526e+05 | 16.31 | False | Compliant | normal |
(11, 26)
Pour ces enregistrements on ne connaît pas non plus la surface dédiée à l'usage donc je choisis de supprimer
data.dropna(subset=['LargestPropertyUseType'],axis=0,inplace=True)
data.shape
(3374, 26)
Gestion des Nan de SecondLargest et ThirdLargest
regardons la liste de tous les usages du batiment, si les second et troisièmes usages y figurent on pourra compléter les valeurs manquantes
modalités=pd.DataFrame(data['ListOfAllPropertyUseTypes'].value_counts())
display(modalités)
| ListOfAllPropertyUseTypes | |
|---|---|
| Multifamily Housing | 866 |
| Multifamily Housing, Parking | 467 |
| Office | 143 |
| Office, Parking | 123 |
| K-12 School | 122 |
| ... | ... |
| Multifamily Housing, Other - Education, Other - Entertainment/Public Assembly, Other - Restaurant/Bar | 1 |
| Medical Office, Office, Other | 1 |
| K-12 School, Office, Retail Store | 1 |
| Office, Other, Other - Restaurant/Bar, Strip Mall | 1 |
| Office, Other, Other - Lodging/Residential, Restaurant, Social/Meeting Hall | 1 |
468 rows × 1 columns
display('Avant: ',data.shape)
data['ListOfAllPropertyUseTypes']=data['ListOfAllPropertyUseTypes'].apply(lambda s :s.replace(s[ s.find( '(' ) : s.find( ')' )+1 ],s[ s.find( '(' ) : s.find( ')' )+1 ].replace(', ','/')))
test=data['ListOfAllPropertyUseTypes'].str.split(pat=', ',expand=True)
# test=data.pop('ListOfAllPropertyUseTypes').str.split(pat=', ',expand=True)
test.drop(0,axis=1,inplace=True)
test.fillna('None',axis=1,inplace=True)
test.columns=['Use'+str(s) for s in test.columns]
data=pd.concat([data,test],axis=1)
display('Aprés :',data.shape)
'Avant: '
(3374, 26)
'Aprés :'
(3374, 34)
suppression de la variable initiale
data.drop('ListOfAllPropertyUseTypes',axis=1,inplace=True)
Use2 correspond à SecondLargestPropertyUseType
Use3 correspond à ThirdLargestPropertyUseType
data[data['SecondLargestPropertyUseTypeGFA'].isna()]['SecondLargestPropertyUseType'].unique()
array([nan], dtype=object)
data[data['SecondLargestPropertyUseTypeGFA'].isna()]['Use2'].unique()
array(['None', 'Parking'], dtype=object)
data[data['ThirdLargestPropertyUseType'].isna()]['ThirdLargestPropertyUseType'].unique()
array([nan], dtype=object)
data[data['ThirdLargestPropertyUseType'].isna()]['Use3'].unique()
array(['None'], dtype=object)
On peut donc imputer None et 0 aux valeurs manquantes
data['SecondLargestPropertyUseTypeGFA'].fillna(0,inplace=True)
data['SecondLargestPropertyUseType'].fillna('None',inplace=True)
data['ThirdLargestPropertyUseTypeGFA'].fillna(0,inplace=True)
data['ThirdLargestPropertyUseType'].fillna('None',inplace=True)
Suppression des colonnes Use1, Use2 et Use3
data.drop(['Use1','Use2','Use3'],axis=1,inplace=True)
missing_values(data)
' Data Frame a 30 colonnes.\nDont 2 colonnes contiennent des valeurs manquantes.'
| Valeurs manquantes | % du Total des valeurs | |
|---|---|---|
| ENERGYSTARScore | 823 | 24.39 |
| YearsENERGYSTARCertified | 3255 | 96.47 |
# YearsENERGYSTARCertified a trop de valeurs manquantes pour pouvoir être exploitée
data.drop('YearsENERGYSTARCertified',axis=1,inplace=True)
# #fonction permettant de tracer des boxplots simples avec distribution
# import matplotlib.pyplot as plt
def Distribution(Sx,titre='',unit_text='',unite=1,fliers=True,tab=10):
# série à traiter, titre de la figure, xlabel, unité abscisses, legende unité , tab= espacement des affichages text
# fliers True/False
f, (ax1, ax2) = plt.subplots(2, 1,figsize=(10,4),sharex=True, gridspec_kw={'height_ratios': [3, 1],'hspace':.3})
f.suptitle(titre,y=1,fontsize=20)
# Boxplot
sns.boxplot(x=Sx,ax=ax2,orient='h',showfliers = fliers, showmeans=True,meanprops={"marker":"d"})
xmin, xmax, ymin, ymax = ax2.axis()
# Formatage des valeurs du bandeau moyenne, ecart type,....
def form (x): return str('{:.1f}'.format(x)).ljust(tab)
# Formatage des unités abscisses
val=ax2.get_xticks()
ax2.set_xticklabels(val/unite)
ax2.tick_params(labelsize=14)
# Affichage unités abscisses
ax2.set_xlabel(unit_text,x=1, ha='right',fontsize=14)
# Ajout des indicateurs statistiques
print(type(unite))
if fliers:
ax2.set_title('Min= '+form(Sx.min()/unite)+
'Médiane= '+form(Sx.median()/unite)+
'Moyenne= '+form(Sx.mean()/unite)+
'\u03C3= '+form(Sx.std()/unite)+
'Max= '+form(Sx.max()/unite),
fontsize=14)
# Histogramme
ax1.set_title(' skewness : '+str(Sx.skew())+' kurtosis : '+str(Sx.kurtosis()))
ax1.tick_params(labelsize=14)
ax1.set_xlim(xmin,xmax)
sns.histplot(data=Sx,stat='count',bins='auto',discrete=False,kde=True,ax=ax1)
# plt.show()
return
consommation du batiment
cible1='SiteEnergyUse(kBtu)'
Distribution(data[cible1],'Distribution de '+cible1,'(en millions de kBtu)',unite=1E6)
Distribution(data[cible1],'Distribution de '+cible1+' sans outliers','(en millions de kBtu)',unite=1E6,fliers=False)
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'float'>
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'float'>
cible2='TotalGHGEmissions'
Distribution(data[cible2],'Distribution de '+cible2,'(en milliers de tonnes CO2)',unite=1E3)
Distribution(data[cible2],'Distribution de '+cible2+' sans outliers','(en milliers de tonnes CO2)',unite=1E3,fliers=False)
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'float'>
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'float'>
Changement de l'ordre des colonnes pour avoir la variable SiteEnergyUse(kBtu) et TotalGHGEmissions, comme premières colonnes de la table
temp=data.pop('TotalGHGEmissions')
data=pd.concat([temp,data],axis=1)
temp=data.pop('SiteEnergyUse(kBtu)')
data=pd.concat([temp,data],axis=1)
data.sample(5)
| SiteEnergyUse(kBtu) | TotalGHGEmissions | DataYear | BuildingType | PrimaryPropertyType | PropertyName | CouncilDistrictCode | Neighborhood | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | PropertyGFAParking | PropertyGFABuilding(s) | LargestPropertyUseType | LargestPropertyUseTypeGFA | SecondLargestPropertyUseType | SecondLargestPropertyUseTypeGFA | ThirdLargestPropertyUseType | ThirdLargestPropertyUseTypeGFA | ENERGYSTARScore | DefaultData | ComplianceStatus | Outlier | Use4 | Use5 | Use6 | Use7 | Use8 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5107 | 1.045838e+06 | 44.26 | 2016 | NonResidential | Other | LAKE CITY | 5 | NORTH | 1969 | 1.0 | 1.0 | 30160 | 0 | 30160 | Other | 30139.0 | None | 0.0 | None | 0.0 | NaN | False | Compliant | normal | None | None | None | None | None |
| 5045 | 1.750362e+06 | 12.20 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | TAVONA CONDO HOA | 4 | LAKE UNION | 2005 | 1.0 | 4.0 | 66013 | 0 | 66013 | Multifamily Housing | 48550.0 | Parking | 12032.0 | None | 0.0 | 36.0 | False | Compliant | normal | None | None | None | None | None |
| 5366 | 8.063703e+05 | 5.62 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | SS055 - 9818 5TH AVE NE | 5 | NORTH | 1990 | 1.0 | 4.0 | 27344 | 0 | 27344 | Multifamily Housing | 21616.0 | Parking | 5728.0 | None | 0.0 | 31.0 | False | Compliant | normal | None | None | None | None | None |
| 4161 | 3.603765e+06 | 25.12 | 2016 | Multifamily MR (5-9) | Mid-Rise Multifamily | SIDNEY APARTMENTS | 7 | DOWNTOWN | 2000 | 1.0 | 6.0 | 160141 | 40377 | 119764 | Multifamily Housing | 132400.0 | None | 0.0 | None | 0.0 | 74.0 | False | Compliant | normal | None | None | None | None | None |
| 6001 | 5.715208e+05 | 3.98 | 2016 | Multifamily LR (1-4) | Low-Rise Multifamily | QUEEN ANNE 20 | 7 | MAGNOLIA / QUEEN ANNE | 1977 | 1.0 | 4.0 | 29088 | 0 | 29088 | Multifamily Housing | 24288.0 | Parking | 4860.0 | None | 0.0 | 92.0 | False | Compliant | normal | None | None | None | None | None |
Lst_qual=pd.DataFrame({'Variables quantitatives':data.iloc[:,1:].select_dtypes('object').columns}).T
Lst_qual.T
| Variables quantitatives | |
|---|---|
| 0 | BuildingType |
| 1 | PrimaryPropertyType |
| 2 | PropertyName |
| 3 | Neighborhood |
| 4 | LargestPropertyUseType |
| 5 | SecondLargestPropertyUseType |
| 6 | ThirdLargestPropertyUseType |
| 7 | DefaultData |
| 8 | ComplianceStatus |
| 9 | Outlier |
| 10 | Use4 |
| 11 | Use5 |
| 12 | Use6 |
| 13 | Use7 |
| 14 | Use8 |
def qual_plot(table,col,kind='count'):
g=sns.catplot(data=table,y=col,kind=kind,orient="h", height=10, aspect=1)
(g.set_axis_labels('Effectifs',col )
.set_titles('Titre'))
plt.show()
modalités=data['BuildingType'].value_counts()
modalités
NonResidential 1472 Multifamily LR (1-4) 1016 Multifamily MR (5-9) 583 Multifamily HR (10+) 108 Nonresidential COS 85 SPS-District K-12 84 Campus 25 Nonresidential WA 1 Name: BuildingType, dtype: int64
# Consigne explicite du projet
# votre équipe s’intéresse de près aux émissions des bâtiments non destinés à l’habitation.
#
BuildingType_a_exclure=['Multifamily LR (1-4)',
'Multifamily MR (5-9)',
'Multifamily HR (10+)',
# catégories sous représentées
# 'Campus',
# 'Nonresidential COS',
# 'SPS-District K-12'
]
data.drop(data[data['BuildingType'].isin(BuildingType_a_exclure)].index,axis=0,inplace=True)
data.shape
(1667, 29)
sns.boxplot(data=data,x='SiteEnergyUse(kBtu)',y='BuildingType',showfliers=True)
<AxesSubplot:xlabel='SiteEnergyUse(kBtu)', ylabel='BuildingType'>
display(cible1,pg.welch_anova(data=data,dv=cible1,between='BuildingType'))
display(cible2,pg.welch_anova(data=data,dv=cible2,between='BuildingType'))
'SiteEnergyUse(kBtu)'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | BuildingType | 4 | 148.641515 | 11.957001 | 1.890277e-08 | 0.095127 |
'TotalGHGEmissions'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | BuildingType | 4 | 153.578189 | 4.923256 | 0.000921 | 0.084526 |
modalités=data['PrimaryPropertyType'].value_counts()
display(modalités)
modalités.shape
Small- and Mid-Sized Office 296 Other 254 Warehouse 187 Large Office 176 K-12 School 126 Mixed Use Property 113 Retail Store 92 Hotel 75 Worship Facility 71 Distribution Center 53 Medical Office 42 Supermarket / Grocery Store 39 Self-Storage Facility 27 University 24 Residence Hall 21 Senior Care Community 20 Refrigerated Warehouse 12 Restaurant 11 Laboratory 10 Hospital 10 Office 3 Low-Rise Multifamily 3 Non-Refrigerated Warehouse 1 Restaurant\n 1 Name: PrimaryPropertyType, dtype: int64
(24,)
Formatage des noms des catégories
data['PrimaryPropertyType']=data['PrimaryPropertyType'].apply(lambda x: x.replace('\n',''))
data['PrimaryPropertyType']=data['PrimaryPropertyType'].apply(lambda x: x.replace(' / ','/'))
data['PrimaryPropertyType']=data['PrimaryPropertyType'].apply(lambda x: x.replace('/Dormitory',''))
data['PrimaryPropertyType']=data['PrimaryPropertyType'].apply(lambda x: x.replace('Non-Refrigerated Warehouse','Warehouse'))
# data['PrimaryPropertyType']=data['PrimaryPropertyType'].apply(lambda x: x.replace('College/',''))
data.loc[data[data['PrimaryPropertyType']=='Office'].index,'PrimaryPropertyType']='Small- and Mid-Sized Office'
data.drop(data[data['PrimaryPropertyType'].str.contains('Multifamily')].index,axis=0,inplace=True)
modalités=data['PrimaryPropertyType'].value_counts()
display(modalités)
modalités.shape
Small- and Mid-Sized Office 299 Other 254 Warehouse 188 Large Office 176 K-12 School 126 Mixed Use Property 113 Retail Store 92 Hotel 75 Worship Facility 71 Distribution Center 53 Medical Office 42 Supermarket/Grocery Store 39 Self-Storage Facility 27 University 24 Residence Hall 21 Senior Care Community 20 Refrigerated Warehouse 12 Restaurant 12 Hospital 10 Laboratory 10 Name: PrimaryPropertyType, dtype: int64
(20,)
sns.boxplot(data=data,x='SiteEnergyUse(kBtu)',y='PrimaryPropertyType',showfliers=False)
<AxesSubplot:xlabel='SiteEnergyUse(kBtu)', ylabel='PrimaryPropertyType'>
display(cible1,pg.welch_anova(data=data,dv=cible1,between='PrimaryPropertyType'))
display(cible2,pg.welch_anova(data=data,dv=cible2,between='PrimaryPropertyType'))
'SiteEnergyUse(kBtu)'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | PrimaryPropertyType | 19 | 191.409525 | 39.854846 | 3.875726e-56 | 0.182479 |
'TotalGHGEmissions'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | PrimaryPropertyType | 19 | 188.53503 | 22.951713 | 3.438601e-39 | 0.287583 |
modalités=pd.DataFrame(data['PropertyName'].value_counts())
display(modalités)
modalités.shape
| PropertyName | |
|---|---|
| CENTENNIAL BUILDING | 2 |
| BAYVIEW BUILDING | 2 |
| DEPT OF SOCIAL & HEALTH SERVICES | 2 |
| AIRPORT WAY | 2 |
| SOUTH PARK | 2 |
| ... | ... |
| US BANK CENTRE | 1 |
| ENTIRE CAMPUS | 1 |
| VIRGINIA MASON MEDICAL CENTER - 2149 | 1 |
| UNIVERSITY BOOK STORE | 1 |
| EMERSON ELEMENTARY | 1 |
1658 rows × 1 columns
(1658, 1)
modalités=pd.DataFrame(data['Neighborhood'].value_counts())
display(modalités)
modalités.shape
| Neighborhood | |
|---|---|
| DOWNTOWN | 363 |
| GREATER DUWAMISH | 348 |
| MAGNOLIA / QUEEN ANNE | 151 |
| LAKE UNION | 147 |
| NORTHEAST | 127 |
| EAST | 124 |
| NORTHWEST | 82 |
| BALLARD | 63 |
| NORTH | 56 |
| CENTRAL | 47 |
| SOUTHEAST | 46 |
| DELRIDGE | 41 |
| SOUTHWEST | 40 |
| North | 9 |
| Ballard | 6 |
| Northwest | 5 |
| Delridge | 4 |
| Central | 4 |
| DELRIDGE NEIGHBORHOODS | 1 |
(19, 1)
Formatage des noms de quartier
data['Neighborhood']=data['Neighborhood'].apply(lambda s : s[:8] if s=='DELRIDGE NEIGHBORHOODS' else s)
data['Neighborhood']=data['Neighborhood'].apply(lambda s : s.capitalize())
modalités=pd.DataFrame(data['Neighborhood'].value_counts())
display(modalités)
modalités.shape
| Neighborhood | |
|---|---|
| Downtown | 363 |
| Greater duwamish | 348 |
| Magnolia / queen anne | 151 |
| Lake union | 147 |
| Northeast | 127 |
| East | 124 |
| Northwest | 87 |
| Ballard | 69 |
| North | 65 |
| Central | 51 |
| Southeast | 46 |
| Delridge | 46 |
| Southwest | 40 |
(13, 1)
sns.boxplot(data=data,x='SiteEnergyUse(kBtu)',y='Neighborhood',showfliers=False)
<AxesSubplot:xlabel='SiteEnergyUse(kBtu)', ylabel='Neighborhood'>
display(cible1,pg.welch_anova(data=data,dv=cible1,between='Neighborhood'))
display(cible2,pg.welch_anova(data=data,dv=cible2,between='Neighborhood'))
'SiteEnergyUse(kBtu)'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | Neighborhood | 12 | 401.212962 | 5.696194 | 4.365162e-09 | 0.015005 |
'TotalGHGEmissions'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | Neighborhood | 12 | 395.106282 | 3.490705 | 0.000064 | 0.016058 |
modalités=data['LargestPropertyUseType'].value_counts()
display(modalités)
modalités.shape
Office 506 Non-Refrigerated Warehouse 200 K-12 School 126 Other 101 Retail Store 100 Hotel 76 Worship Facility 71 Distribution Center 54 Medical Office 44 Supermarket/Grocery Store 40 Other - Recreation 31 Parking 30 Self-Storage Facility 27 College/University 24 Residence Hall/Dormitory 22 Other - Entertainment/Public Assembly 21 Senior Care Community 20 Laboratory 13 Restaurant 13 Refrigerated Warehouse 12 Social/Meeting Hall 10 Hospital (General Medical & Surgical) 10 Multifamily Housing 9 Manufacturing/Industrial Plant 8 Automobile Dealership 6 Repair Services (Vehicle, Shoe, Locksmith, etc) 6 Strip Mall 6 Other - Services 5 Museum 5 Other - Lodging/Residential 5 Fitness Center/Health Club/Gym 5 Performing Arts 4 Library 4 Urgent Care/Clinic/Other Outpatient 4 Other - Mall 4 Financial Office 4 Bank Branch 4 Other/Specialty Hospital 4 Data Center 3 Other - Education 3 Prison/Incarceration 3 Other - Public Services 2 Lifestyle Center 2 Adult Education 2 Pre-school/Daycare 2 Other - Restaurant/Bar 2 Other - Utility 2 Personal Services (Health/Beauty, Dry Cleaning, etc) 1 Wholesale Club/Supercenter 1 Fire Station 1 Movie Theater 1 Police Station 1 Outpatient Rehabilitation/Physical Therapy 1 Courthouse 1 Residential Care Facility 1 Food Service 1 Name: LargestPropertyUseType, dtype: int64
(56,)
display(cible1,pg.welch_anova(data=data,dv=cible1,between='LargestPropertyUseType'))
display(cible2,pg.welch_anova(data=data,dv=cible2,between='LargestPropertyUseType'))
'SiteEnergyUse(kBtu)'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | LargestPropertyUseType | 55 | 86.340238 | 10.164725 | 5.818067e-21 | 0.194437 |
'TotalGHGEmissions'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | LargestPropertyUseType | 55 | 90.548054 | 29.32148 | 1.597624e-39 | 0.287042 |
qual_plot(data,'LargestPropertyUseType')
modalités=data['SecondLargestPropertyUseType'].value_counts()
display(modalités)
modalités.shape
None 802 Parking 342 Office 169 Retail Store 84 Other 45 Non-Refrigerated Warehouse 30 Restaurant 29 Fitness Center/Health Club/Gym 16 Data Center 13 Multifamily Housing 12 Other - Services 11 Supermarket/Grocery Store 10 Medical Office 9 Laboratory 8 Bank Branch 7 Other - Entertainment/Public Assembly 7 Distribution Center 7 K-12 School 5 Swimming Pool 4 Repair Services (Vehicle, Shoe, Locksmith, etc) 4 Other - Recreation 4 Movie Theater 3 Hotel 3 Worship Facility 3 Self-Storage Facility 3 Other - Public Services 3 Automobile Dealership 2 Adult Education 2 Other - Restaurant/Bar 2 Refrigerated Warehouse 2 Food Sales 2 Other - Education 2 Bar/Nightclub 2 Performing Arts 2 Manufacturing/Industrial Plant 2 Financial Office 1 Food Service 1 Pre-school/Daycare 1 Social/Meeting Hall 1 Convenience Store without Gas Station 1 Hospital (General Medical & Surgical) 1 College/University 1 Personal Services (Health/Beauty, Dry Cleaning, etc) 1 Vocational School 1 Courthouse 1 Other - Lodging/Residential 1 Residence Hall/Dormitory 1 Enclosed Mall 1 Name: SecondLargestPropertyUseType, dtype: int64
(48,)
display(cible1,pg.welch_anova(data=data,dv=cible1,between='SecondLargestPropertyUseType'))
display(cible2,pg.welch_anova(data=data,dv=cible2,between='SecondLargestPropertyUseType'))
'SiteEnergyUse(kBtu)'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | SecondLargestPropertyUseType | 47 | 57.572553 | 1.884165 | 0.01123 | 0.04274 |
'TotalGHGEmissions'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | SecondLargestPropertyUseType | 47 | 59.50056 | 1.981444 | 0.006457 | 0.056751 |
qual_plot(data,'SecondLargestPropertyUseType')
modalités=data['ThirdLargestPropertyUseType'].value_counts()
modalités
None 1310 Office 49 Retail Store 48 Parking 46 Other 41 Restaurant 31 Non-Refrigerated Warehouse 15 Data Center 14 Multifamily Housing 12 Swimming Pool 10 Medical Office 9 Pre-school/Daycare 8 Food Service 8 Social/Meeting Hall 8 Other - Restaurant/Bar 5 Bank Branch 5 Financial Office 4 Other - Entertainment/Public Assembly 4 Fitness Center/Health Club/Gym 4 Distribution Center 3 Refrigerated Warehouse 2 Convenience Store without Gas Station 2 Personal Services (Health/Beauty, Dry Cleaning, etc) 2 Self-Storage Facility 2 Other - Technology/Science 2 Other - Recreation 2 Other - Services 2 Laboratory 2 Other - Education 2 K-12 School 2 Other - Utility 1 Strip Mall 1 Fast Food Restaurant 1 Supermarket/Grocery Store 1 Other/Specialty Hospital 1 Manufacturing/Industrial Plant 1 Worship Facility 1 Bar/Nightclub 1 Vocational School 1 Hotel 1 Name: ThirdLargestPropertyUseType, dtype: int64
display(cible1,pg.welch_anova(data=data,dv=cible1,between='ThirdLargestPropertyUseType'))
display(cible2,pg.welch_anova(data=data,dv=cible2,between='ThirdLargestPropertyUseType'))
'SiteEnergyUse(kBtu)'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | ThirdLargestPropertyUseType | 39 | 42.974268 | 2.880639 | 0.000449 | 0.024533 |
'TotalGHGEmissions'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | ThirdLargestPropertyUseType | 39 | 44.314947 | 2.459776 | 0.002065 | 0.031423 |
qual_plot(data,'ThirdLargestPropertyUseType')
pd.DataFrame(data['DefaultData'].value_counts())
| DefaultData | |
|---|---|
| False | 1554 |
| True | 87 |
| No | 19 |
| Yes | 4 |
pd.DataFrame(data['DefaultData'].value_counts()).index
Index([False, True, 'No', 'Yes'], dtype='object')
data['DefaultData']=data['DefaultData'].apply(lambda s : 'No' if s==False else s)
data['DefaultData']=data['DefaultData'].apply(lambda s : 'Yes' if s==True else s)
pd.DataFrame(data['DefaultData'].value_counts())
| DefaultData | |
|---|---|
| No | 1573 |
| Yes | 91 |
display(cible1,pg.welch_anova(data=data,dv=cible1,between='DefaultData'))
display(cible2,pg.welch_anova(data=data,dv=cible2,between='DefaultData'))
'SiteEnergyUse(kBtu)'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | DefaultData | 1 | 1592.361219 | 51.459876 | 1.115271e-12 | 0.001988 |
'TotalGHGEmissions'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | DefaultData | 1 | 1409.36141 | 29.589493 | 6.287241e-08 | 0.001209 |
Suppression de cette variable de la liste car explication de la variance des cibles trés faible
data.drop('DefaultData',axis=1,inplace=True)
pd.DataFrame(data['ComplianceStatus'].value_counts())
| ComplianceStatus | |
|---|---|
| Compliant | 1563 |
| Error - Correct Default Data | 87 |
| Non-Compliant | 14 |
display(cible1,pg.welch_anova(data=data,dv=cible1,between='ComplianceStatus'))
display(cible2,pg.welch_anova(data=data,dv=cible2,between='ComplianceStatus'))
'SiteEnergyUse(kBtu)'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | ComplianceStatus | 2 | 34.772955 | 24.773433 | 2.049412e-07 | 0.001957 |
'TotalGHGEmissions'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | ComplianceStatus | 2 | 39.751891 | 15.966818 | 0.000008 | 0.001462 |
Suppression de cette variable de la liste car explication de la variance des cibles trés faible
data.drop('ComplianceStatus',axis=1,inplace=True)
pd.DataFrame(data['Outlier'].value_counts())
| Outlier | |
|---|---|
| normal | 1646 |
| low outlier | 16 |
| high outlier | 2 |
pd.DataFrame(data['DataYear'].value_counts())
| DataYear | |
|---|---|
| 2016 | 1641 |
| 2015 | 23 |
data.drop('DataYear',axis=1,inplace=True)
data['CouncilDistrictCode'].value_counts()
7 533 2 398 3 202 4 170 5 131 1 117 6 113 Name: CouncilDistrictCode, dtype: int64
data['CouncilDistrictCode']=data['CouncilDistrictCode'].apply(str)
data['CouncilDistrictCode'].value_counts()
7 533 2 398 3 202 4 170 5 131 1 117 6 113 Name: CouncilDistrictCode, dtype: int64
display(cible1,pg.welch_anova(data=data,dv=cible1,between='CouncilDistrictCode'))
display(cible2,pg.welch_anova(data=data,dv=cible2,between='CouncilDistrictCode'))
'SiteEnergyUse(kBtu)'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | CouncilDistrictCode | 6 | 555.676425 | 9.687166 | 3.713351e-10 | 0.012442 |
'TotalGHGEmissions'
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | CouncilDistrictCode | 6 | 542.086757 | 5.43324 | 0.000018 | 0.014144 |
def rapport_cible(table,var,cible):
# Affichage du scatterplot cible vs var avec droite de régression
sns.residplot(data=table, x=var, y=cible, lowess=True, order=1, robust=True, line_kws={'color': 'red'})
plt.suptitle('Relation linéaire entre '+var+' et '+cible,y=1.05,fontsize=(14))
plt.title('R²= '+str(table[var].corr(table[cible],method='pearson').round(4)))
# Création d'une liste contenant les noms des variables quantitatives
Lst_quant=pd.DataFrame({'Variables quantitatives':data.iloc[:,2:].select_dtypes('number').columns})
Lst_quant
| Variables quantitatives | |
|---|---|
| 0 | YearBuilt |
| 1 | NumberofBuildings |
| 2 | NumberofFloors |
| 3 | PropertyGFATotal |
| 4 | PropertyGFAParking |
| 5 | PropertyGFABuilding(s) |
| 6 | LargestPropertyUseTypeGFA |
| 7 | SecondLargestPropertyUseTypeGFA |
| 8 | ThirdLargestPropertyUseTypeGFA |
| 9 | ENERGYSTARScore |
var1='YearBuilt'
Distribution(data[var1],'Distribution de '+var1,'année',unite=1)
<class 'int'>
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
Si l'on considère que les batiments les plus recent ou ceux aqui on été renové recement sont les mieux isolés je soustrait 2016 à cette variable
data[var1]=data[var1].apply(lambda x : 2016-x)
Distribution(data[var1],'Distribution de '+var1,'année',unite=1)
<class 'int'>
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
rapport_cible(data,var1,cible1)
rapport_cible(data,var1,cible2)
var2='NumberofBuildings'
Distribution(data[var2],'Distribution de '+var2,'nombre de batiments',unite=1)
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'int'>
Si l'on considère que la quasi totalité des propriétés ont un seul batiment, je vais diviser toutes les valeurs des surfaces de ces propriétés par le nombre de batiments pour me ramener au cas où une propriété est constituée d'un batiment.
Pour prévoir en situation réelle les consommations d'une propriété ayant n batiments il suffira de multiplier la prédiction par le nombre de bat pour pouvoir estimer ses conso
# Liste des index des propriétés concernées
id_multi_bat=data[data[var2]>1].index
data[data[var2]>1].shape
(53, 26)
VARS=['PropertyGFATotal','PropertyGFAParking','PropertyGFABuilding(s)',
'LargestPropertyUseTypeGFA','SecondLargestPropertyUseTypeGFA','ThirdLargestPropertyUseTypeGFA',cible1,cible2]
for col in VARS:
data.loc[id_multi_bat,col]=data.loc[id_multi_bat,col]/data.loc[id_multi_bat,var2]
cette variable n'est plus utile ->drop
data.drop(var2,axis=1,inplace=True)
var4='NumberofFloors'
Distribution(data[var4],'Distribution de '+var4)
Distribution(data[var4],'Distribution de '+var4+' sans outliers',fliers=False)
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite) <ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'int'> <class 'int'>
rapport_cible(data,var4,cible1)
rapport_cible(data,var4,cible2)
var5='PropertyGFATotal'
Distribution(data[var5],'Distribution de '+var5,'(en milliers de pieds²)',unite=1E3)
Distribution(data[var5].apply(np.log1p),'Distribution de log( '+var5+' +1)',unite=1)
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite) <ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'float'> <class 'int'>
Cette distribution est plus normale je l'intègre aux data
data[str('log_'+var5)]=data[var5].apply(np.log1p)
rapport_cible(data,var5,cible1)
rapport_cible(data,var5,cible2)
var6='PropertyGFAParking'
Distribution(data[var6],'Distribution de '+var6,'(en milliers de pieds²)',unite=1E3)
Distribution(data[var6].apply(np.log1p),'Distribution de log( '+var6+' +1)',unite=1)
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite) <ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'float'> <class 'int'>
Cette distribution est plus normale je l'intègre aux data
data[str('log_'+var6)]=data[var6].apply(np.log1p)
rapport_cible(data,var6,cible1)
C:\Users\demo\anaconda3\envs\Python 3 Tabnine\lib\site-packages\statsmodels\nonparametric\smoothers_lowess.py:214: RuntimeWarning: invalid value encountered in true_divide res, _ = _lowess(y, x, x, np.ones_like(x),
rapport_cible(data,var6,cible2)
C:\Users\demo\anaconda3\envs\Python 3 Tabnine\lib\site-packages\statsmodels\nonparametric\smoothers_lowess.py:214: RuntimeWarning: invalid value encountered in true_divide res, _ = _lowess(y, x, x, np.ones_like(x),
var7='PropertyGFABuilding(s)'
Distribution(data[var7],'Distribution de '+var7,'(en milliers de pieds²)',unite=1E3)
Distribution(data[var7].apply(np.log1p),'Distribution de log( '+var7+' +1 )')
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'float'>
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'int'>
Cette distribution est plus normale je l'intègre aux data
data[str('log_'+var7)]=data[var7].apply(np.log1p)
rapport_cible(data,var7,cible1)
rapport_cible(data,var7,cible2)
var8='LargestPropertyUseTypeGFA'
Distribution(data[var8],'Distribution de '+var8,'(en milliers de pieds²)',unite=1E3)
Distribution(data[var8].apply(np.log1p),'Distribution de log( '+var8+' +1 )')
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite) <ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'float'> <class 'int'>
Cette distribution est plus normale je l'intègre aux data
data[str('log_'+var8)]=data[var8].apply(np.log)
rapport_cible(data,var8,cible1)
rapport_cible(data,var8,cible2)
var9='SecondLargestPropertyUseTypeGFA'
Distribution(data[var9],'Distribution de '+var9,'(en milliers de pieds²)',unite=1E3)
Distribution(data[var9].apply(np.log1p),'Distribution de log( '+var9+' +1 )')
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'float'>
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'int'>
Cette distribution est plus normale je l'intègre aux data
data[str('log_'+var9)]=data[var9].apply(np.log1p)
rapport_cible(data,var9,cible1)
rapport_cible(data,var9,cible2)
var10='ThirdLargestPropertyUseTypeGFA'
Distribution(data[var10],'Distribution de '+var10,'(en milliers de pieds²)',unite=1E3)
Distribution(data[var10].apply(np.log1p),'Distribution de log( '+var10+' +1 )')
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'float'> <class 'int'>
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
Cette distribution est plus normale je l'intègre aux data
data[str('log_'+var10)]=data[var10].apply(np.log1p)
rapport_cible(data,var10,cible1)
C:\Users\demo\anaconda3\envs\Python 3 Tabnine\lib\site-packages\statsmodels\nonparametric\smoothers_lowess.py:214: RuntimeWarning: invalid value encountered in true_divide res, _ = _lowess(y, x, x, np.ones_like(x),
rapport_cible(data,var10,cible2)
C:\Users\demo\anaconda3\envs\Python 3 Tabnine\lib\site-packages\statsmodels\nonparametric\smoothers_lowess.py:214: RuntimeWarning: invalid value encountered in true_divide res, _ = _lowess(y, x, x, np.ones_like(x),
Sauvegarde de Base
# Création de nouvelles variables en divisant les surfaces par le nombre d'étages
data['LargestPropertyUseTypeGFA_per_Floors']=data['LargestPropertyUseTypeGFA']/data['NumberofFloors']
# data['LargestPropertyUseTypeGFA:NumberofBuildings']=data['LargestPropertyUseTypeGFA']/data['NumberofBuildings']
data['PropertyGFABuilding(s)_per_Floors']=data['PropertyGFABuilding(s)']/data['NumberofFloors']
# data['PropertyGFABuilding(s):NumberofBuildings']=data['PropertyGFABuilding(s)']/data['NumberofBuildings']
data['PropertyGFATotal_per_Floors']=data['PropertyGFATotal']/data['NumberofFloors']
# data['PropertyGFATotal:NumberofBuildings']=data['PropertyGFATotal']/data['NumberofBuildings']
data.drop('NumberofFloors',axis=1,inplace=True)
data.shape
(1664, 33)
Distribution(data['PropertyGFATotal_per_Floors'],unite=1)
Distribution(data['PropertyGFATotal_per_Floors'].apply(np.log1p),unite=1)
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'int'>
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'int'>
data['PropertyGFATotal_per_Floors']=data['PropertyGFATotal_per_Floors'].apply(np.log)
Distribution(data['PropertyGFABuilding(s)_per_Floors'],unite=1)
Distribution(data['PropertyGFABuilding(s)_per_Floors'].apply(np.log1p),unite=1)
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'int'>
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'int'>
data['PropertyGFABuilding(s)_per_Floors']=data['PropertyGFABuilding(s)_per_Floors'].apply(np.log)
Distribution(data['PropertyGFATotal_per_Floors'],unite=1)
Distribution(data['PropertyGFATotal_per_Floors'].apply(np.log1p),unite=1)
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite) <ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'int'> <class 'int'>
data['PropertyGFATotal_per_Floors']=data['PropertyGFATotal_per_Floors'].apply(np.log)
Création d'une variable comptabilisant le nombre d'usages d'un batiment
usages=['LargestPropertyUseType', 'SecondLargestPropertyUseType',
'ThirdLargestPropertyUseType',
'Use4', 'Use5', 'Use6', 'Use7', 'Use8']
data_use=data[usages]
data_use
| LargestPropertyUseType | SecondLargestPropertyUseType | ThirdLargestPropertyUseType | Use4 | Use5 | Use6 | Use7 | Use8 | |
|---|---|---|---|---|---|---|---|---|
| 272 | Office | None | None | None | None | None | None | None |
| 312 | Office | Parking | Other | Restaurant | None | None | None | None |
| 358 | Office | Parking | None | None | None | None | None | None |
| 405 | Office | Parking | None | None | None | None | None | None |
| 498 | Medical Office | Parking | None | None | None | None | None | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6711 | Office | None | None | None | None | None | None | None |
| 6712 | Other - Recreation | None | None | None | None | None | None | None |
| 6713 | Other - Recreation | Fitness Center/Health Club/Gym | Swimming Pool | None | None | None | None | None |
| 6714 | Other - Recreation | Fitness Center/Health Club/Gym | Pre-school/Daycare | Pre-school/Daycare | None | None | None | None |
| 6715 | Other - Recreation | Fitness Center/Health Club/Gym | Pre-school/Daycare | Pre-school/Daycare | None | None | None | None |
1664 rows × 8 columns
data['Nbre_Use']=8-data_use[data_use!= 'None'].isnull().sum(axis=1)
data['Nbre_Use']
272 1
312 4
358 2
405 2
498 2
..
6711 1
6712 1
6713 3
6714 4
6715 4
Name: Nbre_Use, Length: 1664, dtype: int64
Scale de energyuse
# Changement du nom de la variable
data.rename(columns={'SiteEnergyUse(kBtu)': 'SiteEnergyUse'},inplace=True)
Distribution(data['SiteEnergyUse'],'Distribution de SiteEnergyUse','kBtu',unite=1)
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'int'>
# Expression en million de kBtu
data['SiteEnergyUse'] = data['SiteEnergyUse'].apply(lambda x: np.log(x * 1E-6))
Distribution(data['SiteEnergyUse'],'Après transformation au log ',unite=1)
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'int'>
Scale TotalGHGEmissions
Distribution(data['TotalGHGEmissions'],'Distribution de TotalGHGEmissions','tonnes de CO2', unite=1)
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'int'>
data['TotalGHGEmissions'] = data['TotalGHGEmissions'].apply(np.log)
Distribution(data['TotalGHGEmissions'],'Après transformation au log',unite=1)
<ipython-input-56-fa81e029ec3d>:18: UserWarning: FixedFormatter should only be used together with FixedLocator ax2.set_xticklabels(val/unite)
<class 'int'>
Suppression des variables non scalées
data.drop(['PropertyGFATotal',
'PropertyGFAParking',
'PropertyGFABuilding(s)',
'LargestPropertyUseTypeGFA',
'SecondLargestPropertyUseTypeGFA',
'ThirdLargestPropertyUseTypeGFA'],axis=1,inplace=True)
data.describe()
| SiteEnergyUse | TotalGHGEmissions | YearBuilt | ENERGYSTARScore | log_PropertyGFATotal | log_PropertyGFAParking | log_PropertyGFABuilding(s) | log_LargestPropertyUseTypeGFA | log_SecondLargestPropertyUseTypeGFA | log_ThirdLargestPropertyUseTypeGFA | LargestPropertyUseTypeGFA_per_Floors | PropertyGFABuilding(s)_per_Floors | PropertyGFATotal_per_Floors | Nbre_Use | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1664.000000 | 1664.000000 | 1664.000000 | 1103.000000 | 1664.000000 | 1664.000000 | 1664.000000 | 1664.000000 | 1664.000000 | 1664.000000 | 1664.000000 | 1664.000000 | 1664.000000 | 1664.000000 |
| mean | 1.056109 | 3.906402 | 54.319712 | 65.541251 | 10.996597 | 2.135837 | 10.915144 | 10.806396 | 4.535550 | 1.737606 | 28988.665580 | 9.959861 | 2.303683 | 1.788462 |
| std | 1.292575 | 1.503565 | 32.864194 | 28.359404 | 0.946555 | 4.248553 | 0.897681 | 0.949069 | 4.912377 | 3.539107 | 45232.377354 | 0.791661 | 0.077696 | 0.945139 |
| min | -4.085847 | -2.120264 | 1.000000 | 1.000000 | 8.366603 | 0.000000 | 8.198914 | 7.956687 | 0.000000 | 0.000000 | 221.696970 | 5.401311 | 1.686642 | 1.000000 |
| 25% | 0.204679 | 2.989463 | 27.000000 | 49.000000 | 10.260182 | 0.000000 | 10.225372 | 10.110502 | 0.000000 | 0.000000 | 10685.520833 | 9.398955 | 2.248089 | 1.000000 |
| 50% | 0.909161 | 3.895385 | 51.000000 | 73.000000 | 10.764339 | 0.000000 | 10.722967 | 10.638388 | 0.000000 | 0.000000 | 18470.000000 | 9.928936 | 2.303631 | 2.000000 |
| 75% | 1.909228 | 4.902976 | 86.000000 | 89.000000 | 11.517653 | 0.000000 | 11.411006 | 11.377641 | 9.448727 | 0.000000 | 30803.297619 | 10.386377 | 2.350730 | 2.000000 |
| max | 6.105653 | 9.733350 | 116.000000 | 100.000000 | 14.603968 | 13.147269 | 14.603968 | 14.357627 | 13.369117 | 13.038436 | 859821.500000 | 13.910821 | 2.632667 | 8.000000 |
msno.matrix(data,color=bleu)
<AxesSubplot:>
data.shape
(1664, 28)
def get_feature_correlation(df,target=None, top_n=None, corr_method='spearman',
remove_duplicates=True, remove_self_correlations=True):
"""
Compute the feature correlation and sort feature pairs based on their correlation
:param df: The dataframe with the predictor variables
:type df: pandas.core.frame.DataFrame
:param top_n: Top N feature pairs to be reported (if None, all of the pairs will be returned)
:param corr_method: Correlation compuation method
:type corr_method: str
:param remove_duplicates: Indicates whether duplicate features must be removed
:type remove_duplicates: bool
:param remove_self_correlations: Indicates whether self correlations will be removed
:type remove_self_correlations: bool
:return: pandas.core.frame.DataFrame
"""
corr_matrix_abs = df.corr(method=corr_method).abs()
# display(corr_matrix_abs )
corr_matrix_abs_us = corr_matrix_abs.unstack()
sorted_correlated_features = corr_matrix_abs_us \
.sort_values(kind="quicksort", ascending=False) \
.reset_index()
# just keep corrélation to target
if target:
corr_matrix_abs.drop(target,axis=0,inplace=True)
to_target=corr_matrix_abs[target].sort_values(kind="quicksort", ascending=False).reset_index()
to_target.columns = [target, 'Correlation (abs)']
return to_target
# Remove comparisons of the same feature
if remove_self_correlations:
sorted_correlated_features = sorted_correlated_features[
(sorted_correlated_features.level_0 != sorted_correlated_features.level_1)
]
# Remove duplicates
if remove_duplicates:
sorted_correlated_features = sorted_correlated_features.iloc[:-2:2]
# Create meaningful names for the columns
sorted_correlated_features.columns = ['Feature 1', 'Feature 2', 'Correlation (abs)']
if top_n:
return sorted_correlated_features[:top_n]
return sorted_correlated_features
quant=data.select_dtypes('number')
quant=data.select_dtypes('number')
correlations_energy=get_feature_correlation(quant,target='SiteEnergyUse')
correlations_ges=get_feature_correlation(quant,target='TotalGHGEmissions')
display(correlations_energy)
correlations_ges
| SiteEnergyUse | Correlation (abs) | |
|---|---|---|
| 0 | TotalGHGEmissions | 0.866207 |
| 1 | log_PropertyGFATotal | 0.733724 |
| 2 | log_PropertyGFABuilding(s) | 0.715881 |
| 3 | log_LargestPropertyUseTypeGFA | 0.696671 |
| 4 | log_PropertyGFAParking | 0.365840 |
| 5 | log_SecondLargestPropertyUseTypeGFA | 0.350302 |
| 6 | PropertyGFATotal_per_Floors | 0.344050 |
| 7 | LargestPropertyUseTypeGFA_per_Floors | 0.275835 |
| 8 | YearBuilt | 0.268536 |
| 9 | PropertyGFABuilding(s)_per_Floors | 0.260242 |
| 10 | Nbre_Use | 0.247116 |
| 11 | log_ThirdLargestPropertyUseTypeGFA | 0.216734 |
| 12 | ENERGYSTARScore | 0.174871 |
| TotalGHGEmissions | Correlation (abs) | |
|---|---|---|
| 0 | SiteEnergyUse | 0.866207 |
| 1 | log_PropertyGFABuilding(s) | 0.569197 |
| 2 | log_PropertyGFATotal | 0.568170 |
| 3 | log_LargestPropertyUseTypeGFA | 0.552047 |
| 4 | PropertyGFATotal_per_Floors | 0.310408 |
| 5 | LargestPropertyUseTypeGFA_per_Floors | 0.266363 |
| 6 | PropertyGFABuilding(s)_per_Floors | 0.263469 |
| 7 | log_SecondLargestPropertyUseTypeGFA | 0.233988 |
| 8 | log_PropertyGFAParking | 0.213037 |
| 9 | Nbre_Use | 0.163429 |
| 10 | YearBuilt | 0.162071 |
| 11 | log_ThirdLargestPropertyUseTypeGFA | 0.156840 |
| 12 | ENERGYSTARScore | 0.141398 |
correlations_energy=correlations_energy[correlations_energy['Correlation (abs)']<.7]
correlations_ges=correlations_ges[correlations_ges['Correlation (abs)']<.7]
display(correlations_energy)
correlations_ges
| SiteEnergyUse | Correlation (abs) | |
|---|---|---|
| 3 | log_LargestPropertyUseTypeGFA | 0.696671 |
| 4 | log_PropertyGFAParking | 0.365840 |
| 5 | log_SecondLargestPropertyUseTypeGFA | 0.350302 |
| 6 | PropertyGFATotal_per_Floors | 0.344050 |
| 7 | LargestPropertyUseTypeGFA_per_Floors | 0.275835 |
| 8 | YearBuilt | 0.268536 |
| 9 | PropertyGFABuilding(s)_per_Floors | 0.260242 |
| 10 | Nbre_Use | 0.247116 |
| 11 | log_ThirdLargestPropertyUseTypeGFA | 0.216734 |
| 12 | ENERGYSTARScore | 0.174871 |
| TotalGHGEmissions | Correlation (abs) | |
|---|---|---|
| 1 | log_PropertyGFABuilding(s) | 0.569197 |
| 2 | log_PropertyGFATotal | 0.568170 |
| 3 | log_LargestPropertyUseTypeGFA | 0.552047 |
| 4 | PropertyGFATotal_per_Floors | 0.310408 |
| 5 | LargestPropertyUseTypeGFA_per_Floors | 0.266363 |
| 6 | PropertyGFABuilding(s)_per_Floors | 0.263469 |
| 7 | log_SecondLargestPropertyUseTypeGFA | 0.233988 |
| 8 | log_PropertyGFAParking | 0.213037 |
| 9 | Nbre_Use | 0.163429 |
| 10 | YearBuilt | 0.162071 |
| 11 | log_ThirdLargestPropertyUseTypeGFA | 0.156840 |
| 12 | ENERGYSTARScore | 0.141398 |
correlations_energy['SiteEnergyUse'].sort_values().to_list()==correlations_ges['TotalGHGEmissions'].sort_values().to_list()
False
sel_feat=['SiteEnergyUse','TotalGHGEmissions']
sel_feat.extend(correlations_energy['SiteEnergyUse'].sort_values().to_list())
Pour Siteenergyuse
cat = data.select_dtypes('object').columns.to_list()
Energy_eta2 = pd.DataFrame(columns=['Source','ddof1','ddof2','F', 'p-unc', 'np2'])
for col in cat:
etaE = pg.welch_anova(data=data, dv='SiteEnergyUse', between=col)
Energy_eta2 = pd.concat([Energy_eta2, etaE],axis=0)
# Energy_eta2.columns = ['Energy /Variable', 'p value', 'np2']
display(Energy_eta2)
print('Elimination des p_values supérieures à 0.05')
usefull_cat_ENR = Energy_eta2[Energy_eta2['p-unc'] < .05].sort_values(
by='np2', ascending=False)
usefull_cat_ENR
C:\Users\demo\anaconda3\envs\Python 3 Tabnine\lib\site-packages\pingouin\parametric.py:1330: RuntimeWarning: divide by zero encountered in double_scalars pval = f.sf(fval, ddof1, 1 / lamb) C:\Users\demo\anaconda3\envs\Python 3 Tabnine\lib\site-packages\pingouin\parametric.py:1336: RuntimeWarning: divide by zero encountered in double_scalars 'ddof2': 1 / lamb,
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | BuildingType | 4 | 1.289454e+02 | 0.865318 | 4.867584e-01 | 0.003206 |
| 0 | PrimaryPropertyType | 19 | 1.883967e+02 | 82.129843 | 2.056229e-80 | 0.384248 |
| 0 | PropertyName | 1657 | 1.972258e+05 | 0.007898 | 1.000000e+00 | 0.998876 |
| 0 | CouncilDistrictCode | 6 | 5.006442e+02 | 22.235397 | 3.002656e-23 | 0.076394 |
| 0 | Neighborhood | 12 | 3.716870e+02 | 14.194463 | 2.318412e-24 | 0.100105 |
| 0 | LargestPropertyUseType | 55 | 8.472567e+01 | 16.003617 | 1.365710e-27 | 0.290406 |
| 0 | SecondLargestPropertyUseType | 47 | 5.543685e+01 | 3.838447 | 1.315425e-06 | 0.145475 |
| 0 | ThirdLargestPropertyUseType | 39 | 4.177240e+01 | 9.937238 | 1.026352e-11 | 0.095020 |
| 0 | Outlier | 2 | 2.604375e+00 | 91.175770 | 3.883079e-03 | 0.054218 |
| 0 | Use4 | 16 | 1.737988e+01 | 2.529046 | 3.200699e-02 | 0.040232 |
| 0 | Use5 | 10 | 1.353492e+01 | 3.103233 | 2.781388e-02 | 0.020415 |
| 0 | Use6 | 6 | inf | 0.000000 | 1.000000e+00 | 0.009945 |
| 0 | Use7 | 4 | inf | 0.000000 | 1.000000e+00 | 0.006719 |
| 0 | Use8 | 3 | inf | 0.000000 | 1.000000e+00 | 0.006595 |
Elimination des p_values supérieures à 0.05
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | PrimaryPropertyType | 19 | 188.396685 | 82.129843 | 2.056229e-80 | 0.384248 |
| 0 | LargestPropertyUseType | 55 | 84.725670 | 16.003617 | 1.365710e-27 | 0.290406 |
| 0 | SecondLargestPropertyUseType | 47 | 55.436849 | 3.838447 | 1.315425e-06 | 0.145475 |
| 0 | Neighborhood | 12 | 371.686996 | 14.194463 | 2.318412e-24 | 0.100105 |
| 0 | ThirdLargestPropertyUseType | 39 | 41.772404 | 9.937238 | 1.026352e-11 | 0.095020 |
| 0 | CouncilDistrictCode | 6 | 500.644156 | 22.235397 | 3.002656e-23 | 0.076394 |
| 0 | Outlier | 2 | 2.604375 | 91.175770 | 3.883079e-03 | 0.054218 |
| 0 | Use4 | 16 | 17.379876 | 2.529046 | 3.200699e-02 | 0.040232 |
| 0 | Use5 | 10 | 13.534922 | 3.103233 | 2.781388e-02 | 0.020415 |
Pour GES
cat = data.select_dtypes('object').columns.to_list()
Ges_eta2 = pd.DataFrame(columns=['Source','ddof1','ddof2','F', 'p-unc', 'np2'])
for col in cat:
etaE = pg.welch_anova(data=data, dv='TotalGHGEmissions', between=col)
Ges_eta2 = pd.concat([Ges_eta2, etaE],axis=0)
# Energy_eta2.columns = ['Energy /Variable', 'p value', 'np2']
display(Ges_eta2)
print('Elimination des p_values supérieures à 0.05')
usefull_cat_GES = Ges_eta2[Ges_eta2['p-unc'] < .05].sort_values(
by='np2', ascending=False)
usefull_cat_GES
C:\Users\demo\anaconda3\envs\Python 3 Tabnine\lib\site-packages\pingouin\parametric.py:1330: RuntimeWarning: divide by zero encountered in double_scalars pval = f.sf(fval, ddof1, 1 / lamb) C:\Users\demo\anaconda3\envs\Python 3 Tabnine\lib\site-packages\pingouin\parametric.py:1336: RuntimeWarning: divide by zero encountered in double_scalars 'ddof2': 1 / lamb,
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | BuildingType | 4 | 1.281407e+02 | 4.741179 | 1.336992e-03 | 0.009880 |
| 0 | PrimaryPropertyType | 19 | 1.876113e+02 | 43.119707 | 4.617300e-58 | 0.294230 |
| 0 | PropertyName | 1657 | 2.156350e+05 | 0.003353 | 1.000000e+00 | 0.997644 |
| 0 | CouncilDistrictCode | 6 | 4.983386e+02 | 14.549101 | 2.492572e-15 | 0.051260 |
| 0 | Neighborhood | 12 | 3.710607e+02 | 8.280259 | 7.051061e-14 | 0.060308 |
| 0 | LargestPropertyUseType | 55 | 8.482623e+01 | 14.735150 | 2.566104e-26 | 0.244130 |
| 0 | SecondLargestPropertyUseType | 47 | 5.827722e+01 | 2.073622 | 4.229841e-03 | 0.073498 |
| 0 | ThirdLargestPropertyUseType | 39 | 4.180814e+01 | 4.200177 | 5.585080e-06 | 0.058123 |
| 0 | Outlier | 2 | 2.560636e+00 | 48.841302 | 9.137514e-03 | 0.053676 |
| 0 | Use4 | 16 | 1.746497e+01 | 4.186107 | 2.502938e-03 | 0.027806 |
| 0 | Use5 | 10 | 1.370961e+01 | 6.324368 | 1.157912e-03 | 0.016497 |
| 0 | Use6 | 6 | inf | 0.000000 | 1.000000e+00 | 0.008526 |
| 0 | Use7 | 4 | inf | 0.000000 | 1.000000e+00 | 0.005270 |
| 0 | Use8 | 3 | inf | 0.000000 | 1.000000e+00 | 0.004969 |
Elimination des p_values supérieures à 0.05
| Source | ddof1 | ddof2 | F | p-unc | np2 | |
|---|---|---|---|---|---|---|
| 0 | PrimaryPropertyType | 19 | 187.611277 | 43.119707 | 4.617300e-58 | 0.294230 |
| 0 | LargestPropertyUseType | 55 | 84.826226 | 14.735150 | 2.566104e-26 | 0.244130 |
| 0 | SecondLargestPropertyUseType | 47 | 58.277215 | 2.073622 | 4.229841e-03 | 0.073498 |
| 0 | Neighborhood | 12 | 371.060718 | 8.280259 | 7.051061e-14 | 0.060308 |
| 0 | ThirdLargestPropertyUseType | 39 | 41.808143 | 4.200177 | 5.585080e-06 | 0.058123 |
| 0 | Outlier | 2 | 2.560636 | 48.841302 | 9.137514e-03 | 0.053676 |
| 0 | CouncilDistrictCode | 6 | 498.338574 | 14.549101 | 2.492572e-15 | 0.051260 |
| 0 | Use4 | 16 | 17.464969 | 4.186107 | 2.502938e-03 | 0.027806 |
| 0 | Use5 | 10 | 13.709606 | 6.324368 | 1.157912e-03 | 0.016497 |
| 0 | BuildingType | 4 | 128.140683 | 4.741179 | 1.336992e-03 | 0.009880 |
La variable BuildingType n'est pas retenue comme pertinenete dans l'explication de la variance de SiteEnergyUse je choisisde la conserver quand même.
sel_feat.extend(usefull_cat_GES['Source'])
data[sel_feat].head(0).T
| SiteEnergyUse |
|---|
| TotalGHGEmissions |
| ENERGYSTARScore |
| LargestPropertyUseTypeGFA_per_Floors |
| Nbre_Use |
| PropertyGFABuilding(s)_per_Floors |
| PropertyGFATotal_per_Floors |
| YearBuilt |
| log_LargestPropertyUseTypeGFA |
| log_PropertyGFAParking |
| log_SecondLargestPropertyUseTypeGFA |
| log_ThirdLargestPropertyUseTypeGFA |
| PrimaryPropertyType |
| LargestPropertyUseType |
| SecondLargestPropertyUseType |
| Neighborhood |
| ThirdLargestPropertyUseType |
| Outlier |
| CouncilDistrictCode |
| Use4 |
| Use5 |
| BuildingType |
data[sel_feat].shape
(1664, 22)
data[sel_feat].describe()
| SiteEnergyUse | TotalGHGEmissions | ENERGYSTARScore | LargestPropertyUseTypeGFA_per_Floors | Nbre_Use | PropertyGFABuilding(s)_per_Floors | PropertyGFATotal_per_Floors | YearBuilt | log_LargestPropertyUseTypeGFA | log_PropertyGFAParking | log_SecondLargestPropertyUseTypeGFA | log_ThirdLargestPropertyUseTypeGFA | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1664.000000 | 1664.000000 | 1103.000000 | 1664.000000 | 1664.000000 | 1664.000000 | 1664.000000 | 1664.000000 | 1664.000000 | 1664.000000 | 1664.000000 | 1664.000000 |
| mean | 1.056109 | 3.906402 | 65.541251 | 28988.665580 | 1.788462 | 9.959861 | 2.303683 | 54.319712 | 10.806396 | 2.135837 | 4.535550 | 1.737606 |
| std | 1.292575 | 1.503565 | 28.359404 | 45232.377354 | 0.945139 | 0.791661 | 0.077696 | 32.864194 | 0.949069 | 4.248553 | 4.912377 | 3.539107 |
| min | -4.085847 | -2.120264 | 1.000000 | 221.696970 | 1.000000 | 5.401311 | 1.686642 | 1.000000 | 7.956687 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.204679 | 2.989463 | 49.000000 | 10685.520833 | 1.000000 | 9.398955 | 2.248089 | 27.000000 | 10.110502 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 0.909161 | 3.895385 | 73.000000 | 18470.000000 | 2.000000 | 9.928936 | 2.303631 | 51.000000 | 10.638388 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 1.909228 | 4.902976 | 89.000000 | 30803.297619 | 2.000000 | 10.386377 | 2.350730 | 86.000000 | 11.377641 | 0.000000 | 9.448727 | 0.000000 |
| max | 6.105653 | 9.733350 | 100.000000 | 859821.500000 | 8.000000 | 13.910821 | 2.632667 | 116.000000 | 14.357627 | 13.147269 | 13.369117 | 13.038436 |
data[sel_feat].to_json('Buildings_STAR_modelisation.json')